/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
 
10
/* Implemented from the specification included in the Intel C++ Compiler
11
   User Guide and Reference, version 9.0.  */
12
 
13
#ifndef NO_WARN_X86_INTRINSICS
14
/* This header file is intended to help port code that uses Intel intrinsics
15
   explicitly from x86_64 to powerpc64/powerpc64le.
16
 
17
   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
18
   PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
19
   However, scalar float operations in vector (XMM) registers require
20
   the POWER8 VSX ISA (2.07) level. There are differences in data
21
   format and placement of float scalars in the vector register, which
22
   require extra steps to match SSE2 scalar float semantics on POWER.
23
 
24
   It should be noted that there are significant differences between X86_64's
25
   MXCSR and PowerISA's FPSCR/VSCR registers. It is recommended to use the
26
   portable <fenv.h> interface instead of accessing the MXCSR directly.
27
 
28
   Most SSE2 scalar float intrinsic operations can be performed more
29
   efficiently as C language float scalar operations or optimized to
30
   use vector SIMD operations. We recommend this for new applications.
31
*/
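/* As a minimal illustration of the recommendation above, a sequence such as

     __m128d __t = _mm_add_sd (_mm_set_sd (__x), _mm_set_sd (__y));
     double __r = _mm_cvtsd_f64 (__t);

   is usually better written as the plain C expression (__x + __y), which
   lets the compiler avoid the splat/merge steps these wrappers perform.  */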
32
#error                                                                         \
33
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
34
#endif
35
 
36
#ifndef EMMINTRIN_H_
37
#define EMMINTRIN_H_
38
 
39
#if defined(__powerpc64__) &&                                                  \
40
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
41
 
42
#include <altivec.h>
43
 
44
/* We need definitions from the SSE header files.  */
45
#include <xmmintrin.h>
46
 
47
/* SSE2 */
48
typedef __vector double __v2df;
49
typedef __vector long long __v2di;
50
typedef __vector unsigned long long __v2du;
51
typedef __vector int __v4si;
52
typedef __vector unsigned int __v4su;
53
typedef __vector short __v8hi;
54
typedef __vector unsigned short __v8hu;
55
typedef __vector signed char __v16qi;
56
typedef __vector unsigned char __v16qu;
57
 
58
/* The Intel API is flexible enough that we must allow aliasing with other
59
   vector types, and their scalar components.  */
60
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
61
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
62
 
63
/* Unaligned version of the same types.  */
64
typedef long long __m128i_u
65
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
66
typedef double __m128d_u
67
    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
68
 
69
/* Define two value permute mask.  */
70
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
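/* Example: _MM_SHUFFLE2 (1, 0) == 2.  When used with _mm_shuffle_pd (below),
   bit 0 of the mask selects which element of the first operand becomes the
   low result and bit 1 selects which element of the second operand becomes
   the high result.  */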
71
 
72
/* Create a vector with element 0 as F and the rest zero.  */
73
extern __inline __m128d
74
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75
    _mm_set_sd(double __F) {
76
  return __extension__(__m128d){__F, 0.0};
77
}
78
 
79
/* Create a vector with both elements equal to F.  */
80
extern __inline __m128d
81
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
82
    _mm_set1_pd(double __F) {
83
  return __extension__(__m128d){__F, __F};
84
}
85
 
86
extern __inline __m128d
87
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
88
    _mm_set_pd1(double __F) {
89
  return _mm_set1_pd(__F);
90
}
91
 
92
/* Create a vector with the lower value X and upper value W.  */
93
extern __inline __m128d
94
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95
    _mm_set_pd(double __W, double __X) {
96
  return __extension__(__m128d){__X, __W};
97
}
98
 
99
/* Create a vector with the lower value W and upper value X.  */
100
extern __inline __m128d
101
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102
    _mm_setr_pd(double __W, double __X) {
103
  return __extension__(__m128d){__W, __X};
104
}
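/* Note the element order: _mm_set_pd (1.0, 2.0) and _mm_setr_pd (2.0, 1.0)
   both produce the vector with element [0] == 2.0 and element [1] == 1.0.  */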
105
 
106
/* Create an undefined vector.  */
107
extern __inline __m128d
108
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
109
    _mm_undefined_pd(void) {
110
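  /* Self-initialization is deliberate: it produces an indeterminate value
     while keeping the compiler from warning about an uninitialized
     variable.  */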
  __m128d __Y = __Y;
111
  return __Y;
112
}
113
 
114
/* Create a vector of zeros.  */
115
extern __inline __m128d
116
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117
    _mm_setzero_pd(void) {
118
  return (__m128d)vec_splats(0);
119
}
120
 
121
/* Sets the low DPFP value of A from the low value of B.  */
122
extern __inline __m128d
123
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
124
    _mm_move_sd(__m128d __A, __m128d __B) {
125
  __v2df __result = (__v2df)__A;
126
  __result[0] = ((__v2df)__B)[0];
127
  return (__m128d)__result;
128
}
129
 
130
/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
131
extern __inline __m128d
132
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133
    _mm_load_pd(double const *__P) {
134
  return ((__m128d)vec_ld(0, (__v16qu *)__P));
135
}
136
 
137
/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
138
extern __inline __m128d
139
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140
    _mm_loadu_pd(double const *__P) {
141
  return (vec_vsx_ld(0, __P));
142
}
143
 
144
/* Create a vector with both elements equal to *P.  */
145
extern __inline __m128d
146
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147
    _mm_load1_pd(double const *__P) {
148
  return (vec_splats(*__P));
149
}
150
 
151
/* Create a vector with element 0 as *P and the rest zero.  */
152
extern __inline __m128d
153
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154
    _mm_load_sd(double const *__P) {
155
  return _mm_set_sd(*__P);
156
}
157
 
158
extern __inline __m128d
159
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160
    _mm_load_pd1(double const *__P) {
161
  return _mm_load1_pd(__P);
162
}
163
 
164
/* Load two DPFP values in reverse order.  The address must be aligned.  */
165
extern __inline __m128d
166
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167
    _mm_loadr_pd(double const *__P) {
168
  __v2df __tmp = _mm_load_pd(__P);
169
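  /* The xxpermdi with a control of 2 swaps the two doublewords here, so the
     two values are returned in reverse order.  */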
  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
170
}
171
 
172
/* Store two DPFP values.  The address must be 16-byte aligned.  */
173
extern __inline void
174
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175
    _mm_store_pd(double *__P, __m128d __A) {
176
  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
177
}
178
 
179
/* Store two DPFP values.  The address need not be 16-byte aligned.  */
180
extern __inline void
181
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182
    _mm_storeu_pd(double *__P, __m128d __A) {
183
  *(__m128d_u *)__P = __A;
184
}
185
 
186
/* Stores the lower DPFP value.  */
187
extern __inline void
188
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
189
    _mm_store_sd(double *__P, __m128d __A) {
190
  *__P = ((__v2df)__A)[0];
191
}
192
 
193
extern __inline double
194
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
195
    _mm_cvtsd_f64(__m128d __A) {
196
  return ((__v2df)__A)[0];
197
}
198
 
199
extern __inline void
200
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201
    _mm_storel_pd(double *__P, __m128d __A) {
202
  _mm_store_sd(__P, __A);
203
}
204
 
205
/* Stores the upper DPFP value.  */
206
extern __inline void
207
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208
    _mm_storeh_pd(double *__P, __m128d __A) {
209
  *__P = ((__v2df)__A)[1];
210
}
211
/* Store the lower DPFP value across two words.
212
   The address must be 16-byte aligned.  */
213
extern __inline void
214
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215
    _mm_store1_pd(double *__P, __m128d __A) {
216
  _mm_store_pd(__P, vec_splat(__A, 0));
217
}
218
 
219
extern __inline void
220
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221
    _mm_store_pd1(double *__P, __m128d __A) {
222
  _mm_store1_pd(__P, __A);
223
}
224
 
225
/* Store two DPFP values in reverse order.  The address must be aligned.  */
226
extern __inline void
227
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228
    _mm_storer_pd(double *__P, __m128d __A) {
229
  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
230
}
231
 
232
/* Intel intrinsic.  */
233
extern __inline long long
234
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235
    _mm_cvtsi128_si64(__m128i __A) {
236
  return ((__v2di)__A)[0];
237
}
238
 
239
/* Microsoft intrinsic.  */
240
extern __inline long long
241
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
242
    _mm_cvtsi128_si64x(__m128i __A) {
243
  return ((__v2di)__A)[0];
244
}
245
 
246
extern __inline __m128d
247
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
248
    _mm_add_pd(__m128d __A, __m128d __B) {
249
  return (__m128d)((__v2df)__A + (__v2df)__B);
250
}
251
 
252
/* Add the lower double-precision (64-bit) floating-point element in
253
   a and b, store the result in the lower element of dst, and copy
254
   the upper element from a to the upper element of dst. */
255
extern __inline __m128d
256
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257
    _mm_add_sd(__m128d __A, __m128d __B) {
258
  __A[0] = __A[0] + __B[0];
259
  return (__A);
260
}
261
 
262
extern __inline __m128d
263
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
264
    _mm_sub_pd(__m128d __A, __m128d __B) {
265
  return (__m128d)((__v2df)__A - (__v2df)__B);
266
}
267
 
268
extern __inline __m128d
269
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270
    _mm_sub_sd(__m128d __A, __m128d __B) {
271
  __A[0] = __A[0] - __B[0];
272
  return (__A);
273
}
274
 
275
extern __inline __m128d
276
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
277
    _mm_mul_pd(__m128d __A, __m128d __B) {
278
  return (__m128d)((__v2df)__A * (__v2df)__B);
279
}
280
 
281
extern __inline __m128d
282
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283
    _mm_mul_sd(__m128d __A, __m128d __B) {
284
  __A[0] = __A[0] * __B[0];
285
  return (__A);
286
}
287
 
288
extern __inline __m128d
289
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290
    _mm_div_pd(__m128d __A, __m128d __B) {
291
  return (__m128d)((__v2df)__A / (__v2df)__B);
292
}
293
 
294
extern __inline __m128d
295
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296
    _mm_div_sd(__m128d __A, __m128d __B) {
297
  __A[0] = __A[0] / __B[0];
298
  return (__A);
299
}
300
 
301
extern __inline __m128d
302
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303
    _mm_sqrt_pd(__m128d __A) {
304
  return (vec_sqrt(__A));
305
}
306
 
307
/* Return pair {sqrt (B[0]), A[1]}.  */
308
extern __inline __m128d
309
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310
    _mm_sqrt_sd(__m128d __A, __m128d __B) {
311
  __v2df __c;
312
  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
313
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
314
}
315
 
316
extern __inline __m128d
317
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
318
    _mm_min_pd(__m128d __A, __m128d __B) {
319
  return (vec_min(__A, __B));
320
}
321
 
322
extern __inline __m128d
323
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
324
    _mm_min_sd(__m128d __A, __m128d __B) {
325
  __v2df __a, __b, __c;
326
  __a = vec_splats(__A[0]);
327
  __b = vec_splats(__B[0]);
328
  __c = vec_min(__a, __b);
329
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
330
}
331
 
332
extern __inline __m128d
333
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
334
    _mm_max_pd(__m128d __A, __m128d __B) {
335
  return (vec_max(__A, __B));
336
}
337
 
338
extern __inline __m128d
339
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
340
    _mm_max_sd(__m128d __A, __m128d __B) {
341
  __v2df __a, __b, __c;
342
  __a = vec_splats(__A[0]);
343
  __b = vec_splats(__B[0]);
344
  __c = vec_max(__a, __b);
345
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
346
}
347
 
348
extern __inline __m128d
349
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350
    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
351
  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
352
}
353
 
354
extern __inline __m128d
355
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356
    _mm_cmplt_pd(__m128d __A, __m128d __B) {
357
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
358
}
359
 
360
extern __inline __m128d
361
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362
    _mm_cmple_pd(__m128d __A, __m128d __B) {
363
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
364
}
365
 
366
extern __inline __m128d
367
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368
    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
369
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
370
}
371
 
372
extern __inline __m128d
373
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
374
    _mm_cmpge_pd(__m128d __A, __m128d __B) {
375
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
376
}
377
 
378
extern __inline __m128d
379
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380
    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
381
  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
382
  return ((__m128d)vec_nor(__temp, __temp));
383
}
384
 
385
extern __inline __m128d
386
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
387
    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
388
  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
389
}
390
 
391
extern __inline __m128d
392
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
393
    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
394
  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
395
}
396
 
397
extern __inline __m128d
398
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
399
    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
400
  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
401
}
402
 
403
extern __inline __m128d
404
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
405
    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
406
  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
407
}
408
 
409
extern __inline __m128d
410
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
411
    _mm_cmpord_pd(__m128d __A, __m128d __B) {
412
  __v2du __c, __d;
413
  /* Compare against self will return false (0's) if NAN.  */
414
  __c = (__v2du)vec_cmpeq(__A, __A);
415
  __d = (__v2du)vec_cmpeq(__B, __B);
416
  /* A != NAN and B != NAN.  */
417
  return ((__m128d)vec_and(__c, __d));
418
}
419
 
420
extern __inline __m128d
421
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
422
    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
423
#if _ARCH_PWR8
424
  __v2du __c, __d;
425
  /* Compare against self will return false (0's) if NAN.  */
426
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
427
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
428
  /* A == NAN OR B == NAN converts to:
429
     NOT(A != NAN) OR NOT(B != NAN).  */
430
  __c = vec_nor(__c, __c);
431
  return ((__m128d)vec_orc(__c, __d));
432
#else
433
  __v2du __c, __d;
434
  /* Compare against self will return false (0's) if NAN.  */
435
  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
436
  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
437
  /* Invert the results so that true ('1's) indicates NAN.  */
438
  __c = vec_nor(__c, __c);
439
  __d = vec_nor(__d, __d);
440
  return ((__m128d)vec_or(__c, __d));
441
#endif
442
}
443
 
444
extern __inline __m128d
445
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
446
    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
447
  __v2df __a, __b, __c;
448
  /* PowerISA VSX does not allow partial (for just lower double)
449
     results. So to ensure we don't generate spurious exceptions
450
     (from the upper double values) we splat the lower double
451
     before we do the operation. */
452
  __a = vec_splats(__A[0]);
453
  __b = vec_splats(__B[0]);
454
  __c = (__v2df)vec_cmpeq(__a, __b);
455
  /* Then we merge the lower double result with the original upper
456
     double from __A.  */
457
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
458
}
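/* The remaining scalar (_sd) compares below follow the same pattern: splat
   the low doubles, compare the full vector, then merge the low result with
   the original upper double of __A.  */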
459
 
460
extern __inline __m128d
461
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
462
    _mm_cmplt_sd(__m128d __A, __m128d __B) {
463
  __v2df __a, __b, __c;
464
  __a = vec_splats(__A[0]);
465
  __b = vec_splats(__B[0]);
466
  __c = (__v2df)vec_cmplt(__a, __b);
467
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
468
}
469
 
470
extern __inline __m128d
471
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
472
    _mm_cmple_sd(__m128d __A, __m128d __B) {
473
  __v2df __a, __b, __c;
474
  __a = vec_splats(__A[0]);
475
  __b = vec_splats(__B[0]);
476
  __c = (__v2df)vec_cmple(__a, __b);
477
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
478
}
479
 
480
extern __inline __m128d
481
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482
    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
483
  __v2df __a, __b, __c;
484
  __a = vec_splats(__A[0]);
485
  __b = vec_splats(__B[0]);
486
  __c = (__v2df)vec_cmpgt(__a, __b);
487
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
488
}
489
 
490
extern __inline __m128d
491
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492
    _mm_cmpge_sd(__m128d __A, __m128d __B) {
493
  __v2df __a, __b, __c;
494
  __a = vec_splats(__A[0]);
495
  __b = vec_splats(__B[0]);
496
  __c = (__v2df)vec_cmpge(__a, __b);
497
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
498
}
499
 
500
extern __inline __m128d
501
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502
    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
503
  __v2df __a, __b, __c;
504
  __a = vec_splats(__A[0]);
505
  __b = vec_splats(__B[0]);
506
  __c = (__v2df)vec_cmpeq(__a, __b);
507
  __c = vec_nor(__c, __c);
508
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
509
}
510
 
511
extern __inline __m128d
512
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513
    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
514
  __v2df __a, __b, __c;
515
  __a = vec_splats(__A[0]);
516
  __b = vec_splats(__B[0]);
517
  /* Not less than is just greater than or equal.  */
518
  __c = (__v2df)vec_cmpge(__a, __b);
519
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
520
}
521
 
522
extern __inline __m128d
523
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524
    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
525
  __v2df __a, __b, __c;
526
  __a = vec_splats(__A[0]);
527
  __b = vec_splats(__B[0]);
528
  /* Not less than or equal is just greater than.  */
529
  __c = (__v2df)vec_cmpgt(__a, __b);
530
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
531
}
532
 
533
extern __inline __m128d
534
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535
    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
536
  __v2df __a, __b, __c;
537
  __a = vec_splats(__A[0]);
538
  __b = vec_splats(__B[0]);
539
  /* Not greater than is just less than or equal.  */
540
  __c = (__v2df)vec_cmple(__a, __b);
541
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
542
}
543
 
544
extern __inline __m128d
545
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
546
    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
547
  __v2df __a, __b, __c;
548
  __a = vec_splats(__A[0]);
549
  __b = vec_splats(__B[0]);
550
  /* Not greater than or equal is just less than.  */
551
  __c = (__v2df)vec_cmplt(__a, __b);
552
  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
553
}
554
 
555
extern __inline __m128d
556
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
557
    _mm_cmpord_sd(__m128d __A, __m128d __B) {
558
  __v2df __r;
559
  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
560
  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
561
}
562
 
563
extern __inline __m128d
564
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
565
    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
566
  __v2df __r;
567
  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
568
  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
569
}
570
 
571
/* FIXME
572
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
573
   exactly the same because GCC for PowerPC only generates unordered
574
   compares (scalar and vector).
575
   Technically _mm_comieq_sd et al. should be using the ordered
576
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
577
   be OK.   */
578
extern __inline int
579
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
580
    _mm_comieq_sd(__m128d __A, __m128d __B) {
581
  return (__A[0] == __B[0]);
582
}
583
 
584
extern __inline int
585
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
586
    _mm_comilt_sd(__m128d __A, __m128d __B) {
587
  return (__A[0] < __B[0]);
588
}
589
 
590
extern __inline int
591
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
592
    _mm_comile_sd(__m128d __A, __m128d __B) {
593
  return (__A[0] <= __B[0]);
594
}
595
 
596
extern __inline int
597
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
598
    _mm_comigt_sd(__m128d __A, __m128d __B) {
599
  return (__A[0] > __B[0]);
600
}
601
 
602
extern __inline int
603
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
604
    _mm_comige_sd(__m128d __A, __m128d __B) {
605
  return (__A[0] >= __B[0]);
606
}
607
 
608
extern __inline int
609
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610
    _mm_comineq_sd(__m128d __A, __m128d __B) {
611
  return (__A[0] != __B[0]);
612
}
613
 
614
extern __inline int
615
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
616
    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
617
  return (__A[0] == __B[0]);
618
}
619
 
620
extern __inline int
621
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
622
    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
623
  return (__A[0] < __B[0]);
624
}
625
 
626
extern __inline int
627
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
628
    _mm_ucomile_sd(__m128d __A, __m128d __B) {
629
  return (__A[0] <= __B[0]);
630
}
631
 
632
extern __inline int
633
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
634
    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
635
  return (__A[0] > __B[0]);
636
}
637
 
638
extern __inline int
639
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
640
    _mm_ucomige_sd(__m128d __A, __m128d __B) {
641
  return (__A[0] >= __B[0]);
642
}
643
 
644
extern __inline int
645
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646
    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
647
  return (__A[0] != __B[0]);
648
}
649
 
650
/* Create a vector of Qi, where i is the element number.  */
651
extern __inline __m128i
652
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
653
    _mm_set_epi64x(long long __q1, long long __q0) {
654
  return __extension__(__m128i)(__v2di){__q0, __q1};
655
}
656
 
657
extern __inline __m128i
658
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
659
    _mm_set_epi64(__m64 __q1, __m64 __q0) {
660
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
661
}
662
 
663
extern __inline __m128i
664
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
665
    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
666
  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
667
}
668
 
669
extern __inline __m128i
670
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
671
    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
672
                  short __q2, short __q1, short __q0) {
673
  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
674
                                        __q4, __q5, __q6, __q7};
675
}
676
 
677
extern __inline __m128i
678
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
679
    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
680
                 char __q10, char __q09, char __q08, char __q07, char __q06,
681
                 char __q05, char __q04, char __q03, char __q02, char __q01,
682
                 char __q00) {
683
  return __extension__(__m128i)(__v16qi){
684
      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
685
      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
686
}
687
 
688
/* Set all of the elements of the vector to A.  */
689
extern __inline __m128i
690
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
691
    _mm_set1_epi64x(long long __A) {
692
  return _mm_set_epi64x(__A, __A);
693
}
694
 
695
extern __inline __m128i
696
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
697
    _mm_set1_epi64(__m64 __A) {
698
  return _mm_set_epi64(__A, __A);
699
}
700
 
701
extern __inline __m128i
702
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703
    _mm_set1_epi32(int __A) {
704
  return _mm_set_epi32(__A, __A, __A, __A);
705
}
706
 
707
extern __inline __m128i
708
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709
    _mm_set1_epi16(short __A) {
710
  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
711
}
712
 
713
extern __inline __m128i
714
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
715
    _mm_set1_epi8(char __A) {
716
  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
717
                      __A, __A, __A, __A, __A);
718
}
719
 
720
/* Create a vector of Qi, where i is the element number.
721
   The parameter order is reversed from the _mm_set_epi* functions.  */
722
extern __inline __m128i
723
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
724
    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
725
  return _mm_set_epi64(__q1, __q0);
726
}
727
 
728
extern __inline __m128i
729
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
730
    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
731
  return _mm_set_epi32(__q3, __q2, __q1, __q0);
732
}
733
 
734
extern __inline __m128i
735
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736
    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
737
                   short __q5, short __q6, short __q7) {
738
  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
739
}
740
 
741
extern __inline __m128i
742
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743
    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
744
                  char __q05, char __q06, char __q07, char __q08, char __q09,
745
                  char __q10, char __q11, char __q12, char __q13, char __q14,
746
                  char __q15) {
747
  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
748
                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
749
}
750
 
751
/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
752
extern __inline __m128i
753
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
754
    _mm_load_si128(__m128i const *__P) {
755
  return *__P;
756
}
757
 
758
extern __inline __m128i
759
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
760
    _mm_loadu_si128(__m128i_u const *__P) {
761
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
762
}
763
 
764
extern __inline __m128i
765
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
766
    _mm_loadl_epi64(__m128i_u const *__P) {
767
  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
768
}
769
 
770
extern __inline void
771
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772
    _mm_store_si128(__m128i *__P, __m128i __B) {
773
  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
774
}
775
 
776
extern __inline void
777
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778
    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
779
  *__P = __B;
780
}
781
 
782
extern __inline void
783
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
784
    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
785
  *(long long *)__P = ((__v2di)__B)[0];
786
}
787
 
788
extern __inline __m64
789
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
790
    _mm_movepi64_pi64(__m128i_u __B) {
791
  return (__m64)((__v2di)__B)[0];
792
}
793
 
794
extern __inline __m128i
795
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796
    _mm_movpi64_epi64(__m64 __A) {
797
  return _mm_set_epi64((__m64)0LL, __A);
798
}
799
 
800
extern __inline __m128i
801
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
802
    _mm_move_epi64(__m128i __A) {
803
  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
804
}
805
 
806
/* Create an undefined vector.  */
807
extern __inline __m128i
808
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809
    _mm_undefined_si128(void) {
810
  __m128i __Y = __Y;
811
  return __Y;
812
}
813
 
814
/* Create a vector of zeros.  */
815
extern __inline __m128i
816
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
817
    _mm_setzero_si128(void) {
818
  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
819
}
820
 
821
#ifdef _ARCH_PWR8
822
extern __inline __m128d
823
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824
    _mm_cvtepi32_pd(__m128i __A) {
825
  __v2di __val;
826
  /* For LE we need to generate Vector Unpack Low Signed Word,
827
     which vec_unpackh produces here.  */
828
  __val = (__v2di)vec_unpackh((__v4si)__A);
829
 
830
  return (__m128d)vec_ctf(__val, 0);
831
}
832
#endif
833
 
834
extern __inline __m128
835
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836
    _mm_cvtepi32_ps(__m128i __A) {
837
  return ((__m128)vec_ctf((__v4si)__A, 0));
838
}
839
 
840
extern __inline __m128i
841
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842
    _mm_cvtpd_epi32(__m128d __A) {
843
  __v2df __rounded = vec_rint(__A);
844
  __v4si __result, __temp;
845
  const __v4si __vzero = {0, 0, 0, 0};
846
 
847
  /* VSX Vector truncate Double-Precision to integer and Convert to
848
   Signed Integer Word format with Saturate.  */
849
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
850
 
851
#ifdef _ARCH_PWR8
852
#ifdef __LITTLE_ENDIAN__
853
  __temp = vec_mergeo(__temp, __temp);
854
#else
855
  __temp = vec_mergee(__temp, __temp);
856
#endif
857
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
858
                                 (__vector long long)__vzero);
859
#else
860
  {
861
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
862
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
863
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
864
  }
865
#endif
866
  return (__m128i)__result;
867
}
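/* Note that xvcvdpsxws leaves the two converted words in non-adjacent word
   elements of the VSX result, so the merge/pack (or permute) step above is
   needed to place them in elements 0-1 with zeros in elements 2-3, matching
   the SSE2 result layout.  */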
868
 
869
extern __inline __m64
870
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871
    _mm_cvtpd_pi32(__m128d __A) {
872
  __m128i __result = _mm_cvtpd_epi32(__A);
873
 
874
  return (__m64)__result[0];
875
}
876
 
877
extern __inline __m128
878
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
879
    _mm_cvtpd_ps(__m128d __A) {
880
  __v4sf __result;
881
  __v4si __temp;
882
  const __v4si __vzero = {0, 0, 0, 0};
883
 
884
  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
885
 
886
#ifdef _ARCH_PWR8
887
#ifdef __LITTLE_ENDIAN__
888
  __temp = vec_mergeo(__temp, __temp);
889
#else
890
  __temp = vec_mergee(__temp, __temp);
891
#endif
892
  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
893
                                 (__vector long long)__vzero);
894
#else
895
  {
896
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
897
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
898
    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
899
  }
900
#endif
901
  return ((__m128)__result);
902
}
903
 
904
extern __inline __m128i
905
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906
    _mm_cvttpd_epi32(__m128d __A) {
907
  __v4si __result;
908
  __v4si __temp;
909
  const __v4si __vzero = {0, 0, 0, 0};
910
 
911
  /* VSX Vector truncate Double-Precision to integer and Convert to
912
   Signed Integer Word format with Saturate.  */
913
  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
914
 
915
#ifdef _ARCH_PWR8
916
#ifdef __LITTLE_ENDIAN__
917
  __temp = vec_mergeo(__temp, __temp);
918
#else
919
  __temp = vec_mergee(__temp, __temp);
920
#endif
921
  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
922
                                 (__vector long long)__vzero);
923
#else
924
  {
925
    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
926
                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
927
    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
928
  }
929
#endif
930
 
931
  return ((__m128i)__result);
932
}
933
 
934
extern __inline __m64
935
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
936
    _mm_cvttpd_pi32(__m128d __A) {
937
  __m128i __result = _mm_cvttpd_epi32(__A);
938
 
939
  return (__m64)__result[0];
940
}
941
 
942
extern __inline int
943
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
944
    _mm_cvtsi128_si32(__m128i __A) {
945
  return ((__v4si)__A)[0];
946
}
947
 
948
#ifdef _ARCH_PWR8
949
extern __inline __m128d
950
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951
    _mm_cvtpi32_pd(__m64 __A) {
952
  __v4si __temp;
953
  __v2di __tmp2;
954
  __v2df __result;
955
 
956
  __temp = (__v4si)vec_splats(__A);
957
  __tmp2 = (__v2di)vec_unpackl(__temp);
958
  __result = vec_ctf((__vector signed long long)__tmp2, 0);
959
  return (__m128d)__result;
960
}
961
#endif
962
 
963
extern __inline __m128i
964
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
965
    _mm_cvtps_epi32(__m128 __A) {
966
  __v4sf __rounded;
967
  __v4si __result;
968
 
969
  __rounded = vec_rint((__v4sf)__A);
970
  __result = vec_cts(__rounded, 0);
971
  return (__m128i)__result;
972
}
973
 
974
extern __inline __m128i
975
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
976
    _mm_cvttps_epi32(__m128 __A) {
977
  __v4si __result;
978
 
979
  __result = vec_cts((__v4sf)__A, 0);
980
  return (__m128i)__result;
981
}
982
 
983
extern __inline __m128d
984
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985
    _mm_cvtps_pd(__m128 __A) {
986
  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
987
#ifdef vec_doubleh
988
  return (__m128d)vec_doubleh((__v4sf)__A);
989
#else
990
  /* Otherwise the compiler is not current, so we need to generate the
991
     equivalent code.  */
992
  __v4sf __a = (__v4sf)__A;
993
  __v4sf __temp;
994
  __v2df __result;
995
#ifdef __LITTLE_ENDIAN__
996
  /* The input float values are in elements {[0], [1]} but the convert
997
     instruction needs them in elements {[1], [3]}, so we use two
998
     shift left double vector word immediates to get the elements
999
     lined up.  */
1000
  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1001
  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1002
#else
1003
  /* The input float values are in elements {[0], [1]} but the convert
1004
     instruction needs them in elements {[0], [2]}, so we use two
1005
     shift left double vector word immediates to get the elements
1006
     lined up.  */
1007
  __temp = vec_vmrghw(__a, __a);
1008
#endif
1009
  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1010
  return (__m128d)__result;
1011
#endif
1012
}
1013
 
1014
extern __inline int
1015
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1016
    _mm_cvtsd_si32(__m128d __A) {
1017
  __v2df __rounded = vec_rint((__v2df)__A);
1018
  int __result = ((__v2df)__rounded)[0];
1019
 
1020
  return __result;
1021
}
1022
/* Intel intrinsic.  */
1023
extern __inline long long
1024
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025
    _mm_cvtsd_si64(__m128d __A) {
1026
  __v2df __rounded = vec_rint((__v2df)__A);
1027
  long long __result = ((__v2df)__rounded)[0];
1028
 
1029
  return __result;
1030
}
1031
 
1032
/* Microsoft intrinsic.  */
1033
extern __inline long long
1034
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1035
    _mm_cvtsd_si64x(__m128d __A) {
1036
  return _mm_cvtsd_si64((__v2df)__A);
1037
}
1038
 
1039
extern __inline int
1040
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041
    _mm_cvttsd_si32(__m128d __A) {
1042
  int __result = ((__v2df)__A)[0];
1043
 
1044
  return __result;
1045
}
1046
 
1047
/* Intel intrinsic.  */
1048
extern __inline long long
1049
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050
    _mm_cvttsd_si64(__m128d __A) {
1051
  long long __result = ((__v2df)__A)[0];
1052
 
1053
  return __result;
1054
}
1055
 
1056
/* Microsoft intrinsic.  */
1057
extern __inline long long
1058
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059
    _mm_cvttsd_si64x(__m128d __A) {
1060
  return _mm_cvttsd_si64(__A);
1061
}
1062
 
1063
extern __inline __m128
1064
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065
    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1066
  __v4sf __result = (__v4sf)__A;
1067
 
1068
#ifdef __LITTLE_ENDIAN__
1069
  __v4sf __temp_s;
1070
  /* Copy double element[0] to element [1] for conversion.  */
1071
  __v2df __temp_b = vec_splat((__v2df)__B, 0);
1072
 
1073
  /* Pre-rotate __A left 3 (logically right 1) elements.  */
1074
  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1075
  /* Convert double to single float scalar in a vector.  */
1076
  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1077
  /* Shift the resulting scalar into vector element [0].  */
1078
  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1079
#else
1080
  __result[0] = ((__v2df)__B)[0];
1081
#endif
1082
  return (__m128)__result;
1083
}
1084
 
1085
extern __inline __m128d
1086
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1087
    _mm_cvtsi32_sd(__m128d __A, int __B) {
1088
  __v2df __result = (__v2df)__A;
1089
  double __db = __B;
1090
  __result[0] = __db;
1091
  return (__m128d)__result;
1092
}
1093
 
1094
/* Intel intrinsic.  */
1095
extern __inline __m128d
1096
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097
    _mm_cvtsi64_sd(__m128d __A, long long __B) {
1098
  __v2df __result = (__v2df)__A;
1099
  double __db = __B;
1100
  __result[0] = __db;
1101
  return (__m128d)__result;
1102
}
1103
 
1104
/* Microsoft intrinsic.  */
1105
extern __inline __m128d
1106
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1107
    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1108
  return _mm_cvtsi64_sd(__A, __B);
1109
}
1110
 
1111
extern __inline __m128d
1112
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1113
    _mm_cvtss_sd(__m128d __A, __m128 __B) {
1114
#ifdef __LITTLE_ENDIAN__
1115
  /* Use splat to move element [0] into position for the convert. */
1116
  __v4sf __temp = vec_splat((__v4sf)__B, 0);
1117
  __v2df __res;
1118
  /* Convert single float scalar to double in a vector.  */
1119
  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1120
  return (__m128d)vec_mergel(__res, (__v2df)__A);
1121
#else
1122
  __v2df __res = (__v2df)__A;
1123
  __res[0] = ((__v4sf)__B)[0];
1124
  return (__m128d)__res;
1125
#endif
1126
}
1127
 
1128
extern __inline __m128d
1129
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1130
    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1131
  __vector double __result;
1132
  const int __litmsk = __mask & 0x3;
1133
 
1134
  if (__litmsk == 0)
1135
    __result = vec_mergeh(__A, __B);
1136
#if __GNUC__ < 6
1137
  else if (__litmsk == 1)
1138
    __result = vec_xxpermdi(__B, __A, 2);
1139
  else if (__litmsk == 2)
1140
    __result = vec_xxpermdi(__B, __A, 1);
1141
#else
1142
  else if (__litmsk == 1)
1143
    __result = vec_xxpermdi(__A, __B, 2);
1144
  else if (__litmsk == 2)
1145
    __result = vec_xxpermdi(__A, __B, 1);
1146
#endif
1147
  else
1148
    __result = vec_mergel(__A, __B);
1149
 
1150
  return __result;
1151
}
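/* For reference, the mask semantics implemented above are:
     result [0] = (__mask & 1) ? __A[1] : __A[0];
     result [1] = (__mask & 2) ? __B[1] : __B[0];
   e.g. _mm_shuffle_pd (__A, __B, _MM_SHUFFLE2 (0, 1)) yields {__A[1], __B[0]}.  */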
1152
 
1153
extern __inline __m128d
1154
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1155
    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1156
  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1157
}
1158
 
1159
extern __inline __m128d
1160
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1161
    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1162
  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1163
}
1164
 
1165
extern __inline __m128d
1166
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1167
    _mm_loadh_pd(__m128d __A, double const *__B) {
1168
  __v2df __result = (__v2df)__A;
1169
  __result[1] = *__B;
1170
  return (__m128d)__result;
1171
}
1172
 
1173
extern __inline __m128d
1174
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175
    _mm_loadl_pd(__m128d __A, double const *__B) {
1176
  __v2df __result = (__v2df)__A;
1177
  __result[0] = *__B;
1178
  return (__m128d)__result;
1179
}
1180
 
1181
#ifdef _ARCH_PWR8
1182
/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1183
 
1184
/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1185
extern __inline int
1186
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1187
    _mm_movemask_pd(__m128d __A) {
1188
#ifdef _ARCH_PWR10
1189
  return vec_extractm((__v2du)__A);
1190
#else
1191
  __vector unsigned long long __result;
1192
  static const __vector unsigned int __perm_mask = {
1193
#ifdef __LITTLE_ENDIAN__
1194
      0x80800040, 0x80808080, 0x80808080, 0x80808080
1195
#else
1196
      0x80808080, 0x80808080, 0x80808080, 0x80804000
1197
#endif
1198
  };
1199
 
1200
  __result = ((__vector unsigned long long)vec_vbpermq(
1201
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1202
 
1203
#ifdef __LITTLE_ENDIAN__
1204
  return __result[1];
1205
#else
1206
  return __result[0];
1207
#endif
1208
#endif /* !_ARCH_PWR10 */
1209
}
1210
#endif /* _ARCH_PWR8 */
1211
 
1212
extern __inline __m128i
1213
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214
    _mm_packs_epi16(__m128i __A, __m128i __B) {
1215
  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1216
}
1217
 
1218
extern __inline __m128i
1219
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1220
    _mm_packs_epi32(__m128i __A, __m128i __B) {
1221
  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1222
}
1223
 
1224
extern __inline __m128i
1225
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1226
    _mm_packus_epi16(__m128i __A, __m128i __B) {
1227
  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1228
}
1229
 
1230
extern __inline __m128i
1231
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1232
    _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1233
  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1234
}
1235
 
1236
extern __inline __m128i
1237
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1238
    _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1239
  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1240
}
1241
 
1242
extern __inline __m128i
1243
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244
    _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1245
  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1246
}
1247
 
1248
extern __inline __m128i
1249
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1250
    _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1251
  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1252
}
1253
 
1254
extern __inline __m128i
1255
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1256
    _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1257
  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1258
}
1259
 
1260
extern __inline __m128i
1261
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262
    _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1263
  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1264
}
1265
 
1266
extern __inline __m128i
1267
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268
    _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1269
  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1270
}
1271
 
1272
extern __inline __m128i
1273
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1274
    _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1275
  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1276
}
1277
 
1278
extern __inline __m128i
1279
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280
    _mm_add_epi8(__m128i __A, __m128i __B) {
1281
  return (__m128i)((__v16qu)__A + (__v16qu)__B);
1282
}
1283
 
1284
extern __inline __m128i
1285
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1286
    _mm_add_epi16(__m128i __A, __m128i __B) {
1287
  return (__m128i)((__v8hu)__A + (__v8hu)__B);
1288
}
1289
 
1290
extern __inline __m128i
1291
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1292
    _mm_add_epi32(__m128i __A, __m128i __B) {
1293
  return (__m128i)((__v4su)__A + (__v4su)__B);
1294
}
1295
 
1296
extern __inline __m128i
1297
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298
    _mm_add_epi64(__m128i __A, __m128i __B) {
1299
  return (__m128i)((__v2du)__A + (__v2du)__B);
1300
}
1301
 
1302
extern __inline __m128i
1303
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304
    _mm_adds_epi8(__m128i __A, __m128i __B) {
1305
  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1306
}
1307
 
1308
extern __inline __m128i
1309
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310
    _mm_adds_epi16(__m128i __A, __m128i __B) {
1311
  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1312
}
1313
 
1314
extern __inline __m128i
1315
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1316
    _mm_adds_epu8(__m128i __A, __m128i __B) {
1317
  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1318
}
1319
 
1320
extern __inline __m128i
1321
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322
    _mm_adds_epu16(__m128i __A, __m128i __B) {
1323
  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1324
}
1325
 
1326
extern __inline __m128i
1327
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328
    _mm_sub_epi8(__m128i __A, __m128i __B) {
1329
  return (__m128i)((__v16qu)__A - (__v16qu)__B);
1330
}
1331
 
1332
extern __inline __m128i
1333
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1334
    _mm_sub_epi16(__m128i __A, __m128i __B) {
1335
  return (__m128i)((__v8hu)__A - (__v8hu)__B);
1336
}
1337
 
1338
extern __inline __m128i
1339
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340
    _mm_sub_epi32(__m128i __A, __m128i __B) {
1341
  return (__m128i)((__v4su)__A - (__v4su)__B);
1342
}
1343
 
1344
extern __inline __m128i
1345
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346
    _mm_sub_epi64(__m128i __A, __m128i __B) {
1347
  return (__m128i)((__v2du)__A - (__v2du)__B);
1348
}
1349
 
1350
extern __inline __m128i
1351
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352
    _mm_subs_epi8(__m128i __A, __m128i __B) {
1353
  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1354
}
1355
 
1356
extern __inline __m128i
1357
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358
    _mm_subs_epi16(__m128i __A, __m128i __B) {
1359
  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1360
}
1361
 
1362
extern __inline __m128i
1363
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364
    _mm_subs_epu8(__m128i __A, __m128i __B) {
1365
  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1366
}
1367
 
1368
extern __inline __m128i
1369
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370
    _mm_subs_epu16(__m128i __A, __m128i __B) {
1371
  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1372
}
1373
 
1374
extern __inline __m128i
1375
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376
    _mm_madd_epi16(__m128i __A, __m128i __B) {
1377
  __vector signed int __zero = {0, 0, 0, 0};
1378
 
1379
  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1380
}
1381
 
1382
extern __inline __m128i
1383
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1384
    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1385
  __vector signed int __w0, __w1;
1386
 
1387
  __vector unsigned char __xform1 = {
1388
#ifdef __LITTLE_ENDIAN__
1389
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1390
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1391
#else
1392
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1393
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1394
#endif
1395
  };
1396
 
1397
  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1398
  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1399
  return (__m128i)vec_perm(__w0, __w1, __xform1);
1400
}
1401
 
1402
extern __inline __m128i
1403
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1404
    _mm_mullo_epi16(__m128i __A, __m128i __B) {
1405
  return (__m128i)((__v8hi)__A * (__v8hi)__B);
1406
}
1407
 
1408
extern __inline __m64
1409
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1410
    _mm_mul_su32(__m64 __A, __m64 __B) {
1411
  unsigned int __a = __A;
1412
  unsigned int __b = __B;
1413
 
1414
  return ((__m64)__a * (__m64)__b);
1415
}
1416
 
1417
#ifdef _ARCH_PWR8
1418
extern __inline __m128i
1419
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1420
    _mm_mul_epu32(__m128i __A, __m128i __B) {
1421
#if __GNUC__ < 8
1422
  __v2du __result;
1423
 
1424
#ifdef __LITTLE_ENDIAN__
1425
  /* VMX Vector Multiply Odd Unsigned Word.  */
1426
  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1427
#else
1428
  /* VMX Vector Multiply Even Unsigned Word.  */
1429
  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1430
#endif
1431
  return (__m128i)__result;
1432
#else
1433
  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1434
#endif
1435
}
1436
#endif
1437
 
1438
extern __inline __m128i
1439
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1440
    _mm_slli_epi16(__m128i __A, int __B) {
1441
  __v8hu __lshift;
1442
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1443
 
1444
  if (__B >= 0 && __B < 16) {
1445
    if (__builtin_constant_p(__B))
1446
      __lshift = (__v8hu)vec_splat_s16(__B);
1447
    else
1448
      __lshift = vec_splats((unsigned short)__B);
1449
 
1450
    __result = vec_sl((__v8hi)__A, __lshift);
1451
  }
1452
 
1453
  return (__m128i)__result;
1454
}
1455
 
1456
extern __inline __m128i
1457
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1458
    _mm_slli_epi32(__m128i __A, int __B) {
1459
  __v4su __lshift;
1460
  __v4si __result = {0, 0, 0, 0};
1461
 
1462
  if (__B >= 0 && __B < 32) {
1463
    if (__builtin_constant_p(__B) && __B < 16)
1464
      __lshift = (__v4su)vec_splat_s32(__B);
1465
    else
1466
      __lshift = vec_splats((unsigned int)__B);
1467
 
1468
    __result = vec_sl((__v4si)__A, __lshift);
1469
  }
1470
 
1471
  return (__m128i)__result;
1472
}
1473
 
1474
#ifdef _ARCH_PWR8
1475
extern __inline __m128i
1476
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1477
    _mm_slli_epi64(__m128i __A, int __B) {
1478
  __v2du __lshift;
1479
  __v2di __result = {0, 0};
1480
 
1481
  if (__B >= 0 && __B < 64) {
1482
    if (__builtin_constant_p(__B) && __B < 16)
1483
      __lshift = (__v2du)vec_splat_s32(__B);
1484
    else
1485
      __lshift = (__v2du)vec_splats((unsigned int)__B);
1486
 
1487
    __result = vec_sl((__v2di)__A, __lshift);
1488
  }
1489
 
1490
  return (__m128i)__result;
1491
}
1492
#endif
1493
 
1494
extern __inline __m128i
1495
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1496
    _mm_srai_epi16(__m128i __A, int __B) {
1497
  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1498
  __v8hi __result;
1499
 
1500
  if (__B < 16) {
1501
    if (__builtin_constant_p(__B))
1502
      __rshift = (__v8hu)vec_splat_s16(__B);
1503
    else
1504
      __rshift = vec_splats((unsigned short)__B);
1505
  }
1506
  __result = vec_sra((__v8hi)__A, __rshift);
1507
 
1508
  return (__m128i)__result;
1509
}
1510
 
1511
extern __inline __m128i
1512
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1513
    _mm_srai_epi32(__m128i __A, int __B) {
1514
  __v4su __rshift = {31, 31, 31, 31};
1515
  __v4si __result;
1516
 
1517
  if (__B < 32) {
1518
    if (__builtin_constant_p(__B)) {
1519
      if (__B < 16)
1520
        __rshift = (__v4su)vec_splat_s32(__B);
1521
      else
1522
        __rshift = (__v4su)vec_splats((unsigned int)__B);
1523
    } else
1524
      __rshift = vec_splats((unsigned int)__B);
1525
  }
1526
  __result = vec_sra((__v4si)__A, __rshift);
1527
 
1528
  return (__m128i)__result;
1529
}
1530
 
1531
extern __inline __m128i
1532
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1533
    _mm_bslli_si128(__m128i __A, const int __N) {
1534
  __v16qu __result;
1535
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1536
 
1537
  if (__N < 16)
1538
    __result = vec_sld((__v16qu)__A, __zeros, __N);
1539
  else
1540
    __result = __zeros;
1541
 
1542
  return (__m128i)__result;
1543
}
1544
 
1545
extern __inline __m128i
1546
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1547
    _mm_bsrli_si128(__m128i __A, const int __N) {
1548
  __v16qu __result;
1549
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1550
 
1551
  if (__N < 16)
1552
#ifdef __LITTLE_ENDIAN__
1553
    if (__builtin_constant_p(__N))
1554
      /* Would like to use Vector Shift Left Double by Octet
1555
         Immediate here to use the immediate form and avoid
1556
         load of __N * 8 value into a separate VR.  */
1557
      __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1558
    else
1559
#endif
1560
    {
1561
      __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1562
#ifdef __LITTLE_ENDIAN__
1563
      __result = vec_sro((__v16qu)__A, __shift);
1564
#else
1565
    __result = vec_slo((__v16qu)__A, __shift);
1566
#endif
1567
    }
1568
  else
1569
    __result = __zeros;
1570
 
1571
  return (__m128i)__result;
1572
}
1573
 
1574
extern __inline __m128i
1575
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1576
    _mm_srli_si128(__m128i __A, const int __N) {
1577
  return _mm_bsrli_si128(__A, __N);
1578
}
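/* Note (editorial): _mm_bslli_si128 and _mm_bsrli_si128 shift the whole
   128-bit value by a byte count, independent of element type, and a count
   of 16 or more produces zero; _mm_srli_si128 is simply an alias for
   _mm_bsrli_si128.  For example, _mm_bsrli_si128(__v, 4) discards the low
   32-bit lane, shifts the remaining lanes down by one, and fills the top
   lane with zero.  */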
1579
 
1580
extern __inline __m128i
1581
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1582
    _mm_slli_si128(__m128i __A, const int _imm5) {
1583
  __v16qu __result;
1584
  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1585
 
1586
  if (_imm5 < 16)
1587
#ifdef __LITTLE_ENDIAN__
1588
    __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1589
#else
1590
    __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1591
#endif
1592
  else
1593
    __result = __zeros;
1594
 
1595
  return (__m128i)__result;
1596
}
1597
 
1598
extern __inline __m128i
1599
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1600
 
1601
    _mm_srli_epi16(__m128i __A, int __B) {
1602
  __v8hu __rshift;
1603
  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1604
 
1605
  if (__B < 16) {
1606
    if (__builtin_constant_p(__B))
1607
      __rshift = (__v8hu)vec_splat_s16(__B);
1608
    else
1609
      __rshift = vec_splats((unsigned short)__B);
1610
 
1611
    __result = vec_sr((__v8hi)__A, __rshift);
1612
  }
1613
 
1614
  return (__m128i)__result;
1615
}
1616
 
1617
extern __inline __m128i
1618
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1619
    _mm_srli_epi32(__m128i __A, int __B) {
1620
  __v4su __rshift;
1621
  __v4si __result = {0, 0, 0, 0};
1622
 
1623
  if (__B < 32) {
1624
    if (__builtin_constant_p(__B)) {
1625
      if (__B < 16)
1626
        __rshift = (__v4su)vec_splat_s32(__B);
1627
      else
1628
        __rshift = (__v4su)vec_splats((unsigned int)__B);
1629
    } else
1630
      __rshift = vec_splats((unsigned int)__B);
1631
 
1632
    __result = vec_sr((__v4si)__A, __rshift);
1633
  }
1634
 
1635
  return (__m128i)__result;
1636
}
1637
 
1638
#ifdef _ARCH_PWR8
1639
extern __inline __m128i
1640
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1641
    _mm_srli_epi64(__m128i __A, int __B) {
1642
  __v2du __rshift;
1643
  __v2di __result = {0, 0};
1644
 
1645
  if (__B < 64) {
1646
    if (__builtin_constant_p(__B)) {
1647
      if (__B < 16)
1648
        __rshift = (__v2du)vec_splat_s32(__B);
1649
      else
1650
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
1651
    } else
1652
      __rshift = (__v2du)vec_splats((unsigned int)__B);
1653
 
1654
    __result = vec_sr((__v2di)__A, __rshift);
1655
  }
1656
 
1657
  return (__m128i)__result;
1658
}
1659
#endif
1660
 
1661
extern __inline __m128i
1662
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1663
    _mm_sll_epi16(__m128i __A, __m128i __B) {
1664
  __v8hu __lshift;
1665
  __vector __bool short __shmask;
1666
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1667
  __v8hu __result;
1668
 
1669
#ifdef __LITTLE_ENDIAN__
1670
  __lshift = vec_splat((__v8hu)__B, 0);
1671
#else
1672
  __lshift = vec_splat((__v8hu)__B, 3);
1673
#endif
1674
  __shmask = vec_cmple(__lshift, __shmax);
1675
  __result = vec_sl((__v8hu)__A, __lshift);
1676
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1677
 
1678
  return (__m128i)__result;
1679
}
1680
 
1681
extern __inline __m128i
1682
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683
    _mm_sll_epi32(__m128i __A, __m128i __B) {
1684
  __v4su __lshift;
1685
  __vector __bool int __shmask;
1686
  const __v4su __shmax = {32, 32, 32, 32};
1687
  __v4su __result;
1688
#ifdef __LITTLE_ENDIAN__
1689
  __lshift = vec_splat((__v4su)__B, 0);
1690
#else
1691
  __lshift = vec_splat((__v4su)__B, 1);
1692
#endif
1693
  __shmask = vec_cmplt(__lshift, __shmax);
1694
  __result = vec_sl((__v4su)__A, __lshift);
1695
  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1696
 
1697
  return (__m128i)__result;
1698
}
1699
 
1700
#ifdef _ARCH_PWR8
1701
extern __inline __m128i
1702
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1703
    _mm_sll_epi64(__m128i __A, __m128i __B) {
1704
  __v2du __lshift;
1705
  __vector __bool long long __shmask;
1706
  const __v2du __shmax = {64, 64};
1707
  __v2du __result;
1708
 
1709
  __lshift = vec_splat((__v2du)__B, 0);
1710
  __shmask = vec_cmplt(__lshift, __shmax);
1711
  __result = vec_sl((__v2du)__A, __lshift);
1712
  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1713
 
1714
  return (__m128i)__result;
1715
}
1716
#endif
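/* Usage sketch (editorial, not part of the original header): the
   _mm_sll_epi16/32/64 forms take a run-time shift count from the low-order
   element of __B, and counts at or beyond the element width give zero.
   The helper name below is hypothetical.

     static inline __m128i __example_shift_by_count(__m128i __v, int __n) {
       __m128i __count = _mm_cvtsi32_si128(__n);  // count in the low element
       return _mm_sll_epi32(__v, __count);
     }
*/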
1717
 
1718
extern __inline __m128i
1719
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720
    _mm_sra_epi16(__m128i __A, __m128i __B) {
1721
  const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1722
  __v8hu __rshift;
1723
  __v8hi __result;
1724
 
1725
#ifdef __LITTLE_ENDIAN__
1726
  __rshift = vec_splat((__v8hu)__B, 0);
1727
#else
1728
  __rshift = vec_splat((__v8hu)__B, 3);
1729
#endif
1730
  __rshift = vec_min(__rshift, __rshmax);
1731
  __result = vec_sra((__v8hi)__A, __rshift);
1732
 
1733
  return (__m128i)__result;
1734
}
1735
 
1736
extern __inline __m128i
1737
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1738
    _mm_sra_epi32(__m128i __A, __m128i __B) {
1739
  const __v4su __rshmax = {31, 31, 31, 31};
1740
  __v4su __rshift;
1741
  __v4si __result;
1742
 
1743
#ifdef __LITTLE_ENDIAN__
1744
  __rshift = vec_splat((__v4su)__B, 0);
1745
#else
1746
  __rshift = vec_splat((__v4su)__B, 1);
1747
#endif
1748
  __rshift = vec_min(__rshift, __rshmax);
1749
  __result = vec_sra((__v4si)__A, __rshift);
1750
 
1751
  return (__m128i)__result;
1752
}
1753
 
1754
extern __inline __m128i
1755
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756
    _mm_srl_epi16(__m128i __A, __m128i __B) {
1757
  __v8hu __rshift;
1758
  __vector __bool short __shmask;
1759
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1760
  __v8hu __result;
1761
 
1762
#ifdef __LITTLE_ENDIAN__
1763
  __rshift = vec_splat((__v8hu)__B, 0);
1764
#else
1765
  __rshift = vec_splat((__v8hu)__B, 3);
1766
#endif
1767
  __shmask = vec_cmple(__rshift, __shmax);
1768
  __result = vec_sr((__v8hu)__A, __rshift);
1769
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1770
 
1771
  return (__m128i)__result;
1772
}
1773
 
1774
extern __inline __m128i
1775
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1776
    _mm_srl_epi32(__m128i __A, __m128i __B) {
1777
  __v4su __rshift;
1778
  __vector __bool int __shmask;
1779
  const __v4su __shmax = {32, 32, 32, 32};
1780
  __v4su __result;
1781
 
1782
#ifdef __LITTLE_ENDIAN__
1783
  __rshift = vec_splat((__v4su)__B, 0);
1784
#else
1785
  __rshift = vec_splat((__v4su)__B, 1);
1786
#endif
1787
  __shmask = vec_cmplt(__rshift, __shmax);
1788
  __result = vec_sr((__v4su)__A, __rshift);
1789
  __result = vec_sel((__v4su)__shmask, __result, __shmask);
1790
 
1791
  return (__m128i)__result;
1792
}
1793
 
1794
#ifdef _ARCH_PWR8
1795
extern __inline __m128i
1796
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1797
    _mm_srl_epi64(__m128i __A, __m128i __B) {
1798
  __v2du __rshift;
1799
  __vector __bool long long __shmask;
1800
  const __v2du __shmax = {64, 64};
1801
  __v2du __result;
1802
 
1803
  __rshift = vec_splat((__v2du)__B, 0);
1804
  __shmask = vec_cmplt(__rshift, __shmax);
1805
  __result = vec_sr((__v2du)__A, __rshift);
1806
  __result = vec_sel((__v2du)__shmask, __result, __shmask);
1807
 
1808
  return (__m128i)__result;
1809
}
1810
#endif
1811
 
1812
extern __inline __m128d
1813
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1814
    _mm_and_pd(__m128d __A, __m128d __B) {
1815
  return (vec_and((__v2df)__A, (__v2df)__B));
1816
}
1817
 
1818
extern __inline __m128d
1819
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1820
    _mm_andnot_pd(__m128d __A, __m128d __B) {
1821
  return (vec_andc((__v2df)__B, (__v2df)__A));
1822
}
1823
 
1824
extern __inline __m128d
1825
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1826
    _mm_or_pd(__m128d __A, __m128d __B) {
1827
  return (vec_or((__v2df)__A, (__v2df)__B));
1828
}
1829
 
1830
extern __inline __m128d
1831
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1832
    _mm_xor_pd(__m128d __A, __m128d __B) {
1833
  return (vec_xor((__v2df)__A, (__v2df)__B));
1834
}
1835
 
1836
extern __inline __m128i
1837
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1838
    _mm_and_si128(__m128i __A, __m128i __B) {
1839
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1840
}
1841
 
1842
extern __inline __m128i
1843
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1844
    _mm_andnot_si128(__m128i __A, __m128i __B) {
1845
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1846
}
1847
 
1848
extern __inline __m128i
1849
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1850
    _mm_or_si128(__m128i __A, __m128i __B) {
1851
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1852
}
1853
 
1854
extern __inline __m128i
1855
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1856
    _mm_xor_si128(__m128i __A, __m128i __B) {
1857
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1858
}
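/* Usage sketch (editorial, not part of the original header): the andnot
   forms compute (~first & second), which is convenient for clearing bits
   through a constant mask.  The helper name is hypothetical; _mm_set1_pd is
   assumed to be provided earlier in this header.

     static inline __m128d __example_fabs_pd(__m128d __x) {
       // clear the sign bit of both doubles: ~(-0.0) & x
       return _mm_andnot_pd(_mm_set1_pd(-0.0), __x);
     }
*/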
1859
 
1860
extern __inline __m128i
1861
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1862
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1863
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1864
}
1865
 
1866
extern __inline __m128i
1867
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1868
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1869
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1870
}
1871
 
1872
extern __inline __m128i
1873
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1874
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1875
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1876
}
1877
 
1878
extern __inline __m128i
1879
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1880
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1881
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1882
}
1883
 
1884
extern __inline __m128i
1885
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1886
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1887
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1888
}
1889
 
1890
extern __inline __m128i
1891
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1892
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1893
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1894
}
1895
 
1896
extern __inline __m128i
1897
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1898
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1899
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1900
}
1901
 
1902
extern __inline __m128i
1903
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1904
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1905
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1906
}
1907
 
1908
extern __inline __m128i
1909
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1910
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1911
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1912
}
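/* Usage sketch (editorial, not part of the original header): the integer
   compares return all-ones per element for true and all-zeros for false,
   so they combine with the bitwise operations above into a branchless
   select.  The helper name is hypothetical.

     static inline __m128i __example_max_epi32(__m128i __a, __m128i __b) {
       __m128i __gt = _mm_cmpgt_epi32(__a, __b);
       return _mm_or_si128(_mm_and_si128(__gt, __a),
                           _mm_andnot_si128(__gt, __b));
     }
*/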
1913
 
1914
extern __inline int
1915
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1916
    _mm_extract_epi16(__m128i const __A, int const __N) {
1917
  return (unsigned short)((__v8hi)__A)[__N & 7];
1918
}
1919
 
1920
extern __inline __m128i
1921
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1922
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1923
  __v8hi __result = (__v8hi)__A;
1924
 
1925
  __result[(__N & 7)] = __D;
1926
 
1927
  return (__m128i)__result;
1928
}
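/* Note (editorial): _mm_extract_epi16 zero-extends the selected 16-bit lane
   into an int and _mm_insert_epi16 returns a copy of __A with one lane
   replaced; both use only the low three bits of __N, so e.g.
   _mm_extract_epi16(__v, 9) reads lane 1.  */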
1929
 
1930
extern __inline __m128i
1931
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1932
    _mm_max_epi16(__m128i __A, __m128i __B) {
1933
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1934
}
1935
 
1936
extern __inline __m128i
1937
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1938
    _mm_max_epu8(__m128i __A, __m128i __B) {
1939
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1940
}
1941
 
1942
extern __inline __m128i
1943
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1944
    _mm_min_epi16(__m128i __A, __m128i __B) {
1945
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1946
}
1947
 
1948
extern __inline __m128i
1949
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1950
    _mm_min_epu8(__m128i __A, __m128i __B) {
1951
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1952
}
1953
 
1954
#ifdef _ARCH_PWR8
1955
/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1956
 
1957
/* Return a mask created from the most significant bit of each 8-bit
1958
   element in A.  */
1959
extern __inline int
1960
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1961
    _mm_movemask_epi8(__m128i __A) {
1962
#ifdef _ARCH_PWR10
1963
  return vec_extractm((__v16qu)__A);
1964
#else
1965
  __vector unsigned long long __result;
1966
  static const __vector unsigned char __perm_mask = {
1967
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1968
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
1969
 
1970
  __result = ((__vector unsigned long long)vec_vbpermq(
1971
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1972
 
1973
#ifdef __LITTLE_ENDIAN__
1974
  return __result[1];
1975
#else
1976
  return __result[0];
1977
#endif
1978
#endif /* !_ARCH_PWR10 */
1979
}
1980
#endif /* _ARCH_PWR8 */
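/* Usage sketch (editorial, not part of the original header; on PowerPC this
   needs the _ARCH_PWR8 guard above): a common movemask idiom is testing
   whether any byte of a vector matches a value.  The helper name is
   hypothetical; _mm_set1_epi8 is assumed to be provided earlier in this
   header.

     static inline int __example_contains_byte(__m128i __v, char __c) {
       __m128i __eq = _mm_cmpeq_epi8(__v, _mm_set1_epi8(__c));
       return _mm_movemask_epi8(__eq) != 0;  // nonzero if any lane matched
     }
*/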
1981
 
1982
extern __inline __m128i
1983
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1984
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1985
  __v4su __w0, __w1;
1986
  __v16qu __xform1 = {
1987
#ifdef __LITTLE_ENDIAN__
1988
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1989
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1990
#else
1991
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1992
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1993
#endif
1994
  };
1995
 
1996
  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1997
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1998
  return (__m128i)vec_perm(__w0, __w1, __xform1);
1999
}
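/* Note (editorial): _mm_mulhi_epu16 keeps the high half of each unsigned
   16 x 16 -> 32-bit product; combined with _mm_mullo_epi16 (also provided
   by this header) the full 32-bit products can be reassembled.  */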
2000
 
2001
extern __inline __m128i
2002
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2003
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2004
  unsigned long __element_selector_98 = __mask & 0x03;
2005
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2006
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2007
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2008
  static const unsigned short __permute_selectors[4] = {
2009
#ifdef __LITTLE_ENDIAN__
2010
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2011
#else
2012
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2013
#endif
2014
  };
2015
  __v2du __pmask =
2016
#ifdef __LITTLE_ENDIAN__
2017
      {0x1716151413121110UL, 0UL};
2018
#else
2019
      {0x1011121314151617UL, 0UL};
2020
#endif
2021
  __m64_union __t;
2022
  __v2du __a, __r;
2023
 
2024
  __t.as_short[0] = __permute_selectors[__element_selector_98];
2025
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
2026
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
2027
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
2028
  __pmask[1] = __t.as_m64;
2029
  __a = (__v2du)__A;
2030
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2031
  return (__m128i)__r;
2032
}
2033
 
2034
extern __inline __m128i
2035
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2036
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2037
  unsigned long __element_selector_10 = __mask & 0x03;
2038
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2039
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2040
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2041
  static const unsigned short __permute_selectors[4] = {
2042
#ifdef __LITTLE_ENDIAN__
2043
      0x0100, 0x0302, 0x0504, 0x0706
2044
#else
2045
      0x0001, 0x0203, 0x0405, 0x0607
2046
#endif
2047
  };
2048
  __v2du __pmask =
2049
#ifdef __LITTLE_ENDIAN__
2050
      {0UL, 0x1f1e1d1c1b1a1918UL};
2051
#else
2052
      {0UL, 0x18191a1b1c1d1e1fUL};
2053
#endif
2054
  __m64_union __t;
2055
  __v2du __a, __r;
2056
  __t.as_short[0] = __permute_selectors[__element_selector_10];
2057
  __t.as_short[1] = __permute_selectors[__element_selector_32];
2058
  __t.as_short[2] = __permute_selectors[__element_selector_54];
2059
  __t.as_short[3] = __permute_selectors[__element_selector_76];
2060
  __pmask[0] = __t.as_m64;
2061
  __a = (__v2du)__A;
2062
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2063
  return (__m128i)__r;
2064
}
2065
 
2066
extern __inline __m128i
2067
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2068
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
2069
  unsigned long __element_selector_10 = __mask & 0x03;
2070
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2071
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2072
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2073
  static const unsigned int __permute_selectors[4] = {
2074
#ifdef __LITTLE_ENDIAN__
2075
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2076
#else
2077
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2078
#endif
2079
  };
2080
  __v4su __t;
2081
 
2082
  __t[0] = __permute_selectors[__element_selector_10];
2083
  __t[1] = __permute_selectors[__element_selector_32];
2084
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2085
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
2086
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2087
                           (__vector unsigned char)__t);
2088
}
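/* Usage sketch (editorial, not part of the original header): the 8-bit mask
   packs four 2-bit source-lane selectors, lowest destination lane first.

     // __m128i __b = _mm_shuffle_epi32(__v, 0x00);  // broadcast lane 0
     // __m128i __r = _mm_shuffle_epi32(__v, 0x1B);  // reverse the four lanes
*/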
2089
 
2090
extern __inline void
2091
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2092
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2093
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2094
  __v16qu __mask, __tmp;
2095
  __m128i_u *__p = (__m128i_u *)__C;
2096
 
2097
  __tmp = (__v16qu)_mm_loadu_si128(__p);
2098
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2099
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2100
  _mm_storeu_si128(__p, (__m128i)__tmp);
2101
}
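/* Note (editorial): unlike the x86 instruction, this emulation performs a
   full 16-byte load, byte select, and 16-byte store at __C, so __C must
   point to 16 readable and writable bytes even when the mask disables some
   lanes.  */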
2102
 
2103
extern __inline __m128i
2104
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2105
    _mm_avg_epu8(__m128i __A, __m128i __B) {
2106
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2107
}
2108
 
2109
extern __inline __m128i
2110
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2111
    _mm_avg_epu16(__m128i __A, __m128i __B) {
2112
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2113
}
2114
 
2115
extern __inline __m128i
2116
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2117
    _mm_sad_epu8(__m128i __A, __m128i __B) {
2118
  __v16qu __a, __b;
2119
  __v16qu __vabsdiff;
2120
  __v4si __vsum;
2121
  const __v4su __zero = {0, 0, 0, 0};
2122
  __v4si __result;
2123
 
2124
  __a = (__v16qu)__A;
2125
  __b = (__v16qu)__B;
2126
#ifndef _ARCH_PWR9
2127
  __v16qu __vmin = vec_min(__a, __b);
2128
  __v16qu __vmax = vec_max(__a, __b);
2129
  __vabsdiff = vec_sub(__vmax, __vmin);
2130
#else
2131
  __vabsdiff = vec_absd(__a, __b);
2132
#endif
2133
  /* Sum four groups of bytes into integers.  */
2134
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2135
#ifdef __LITTLE_ENDIAN__
2136
  /* Sum across four integers with two integer results.  */
2137
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
2138
  /* Note: vec_sum2s could be used here, but on little-endian, vector
2139
     shifts are added that are not needed for this use-case.
2140
     A vector shift that repositions the 32-bit integer results
2141
     (currently at [0] and [2]) to [1] and [3] would then have to be
2142
     undone, since the desired results are two 64-bit integers
2143
     ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
2144
#else
2145
  /* Sum across four integers with two integer results.  */
2146
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2147
  /* Rotate the sums into the correct position.  */
2148
  __result = vec_sld(__result, __result, 6);
2149
#endif
2150
  return (__m128i)__result;
2151
}
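/* Usage sketch (editorial, not part of the original header): the result
   holds two byte-sum partial totals, zero-extended into the two 64-bit
   lanes (bytes 0-7 and 8-15).  The helper name is hypothetical.

     static inline int __example_sad16(__m128i __a, __m128i __b) {
       __m128i __s = _mm_sad_epu8(__a, __b);
       return _mm_extract_epi16(__s, 0) + _mm_extract_epi16(__s, 4);
     }
*/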
2152
 
2153
extern __inline void
2154
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2155
    _mm_stream_si32(int *__A, int __B) {
2156
  /* Use the data cache block touch for store transient.  */
2157
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2158
  *__A = __B;
2159
}
2160
 
2161
extern __inline void
2162
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2163
    _mm_stream_si64(long long int *__A, long long int __B) {
2164
  /* Use the data cache block touch for store transient.  */
2165
  __asm__("     dcbtstt 0,%0" : : "b"(__A) : "memory");
2166
  *__A = __B;
2167
}
2168
 
2169
extern __inline void
2170
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2171
    _mm_stream_si128(__m128i *__A, __m128i __B) {
2172
  /* Use the data cache block touch for store transient.  */
2173
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2174
  *__A = __B;
2175
}
2176
 
2177
extern __inline void
2178
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2179
    _mm_stream_pd(double *__A, __m128d __B) {
2180
  /* Use the data cache block touch for store transient.  */
2181
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2182
  *(__m128d *)__A = __B;
2183
}
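/* Note (editorial): on PowerPC the _mm_stream_* operations above are plain
   stores preceded by a dcbtstt (data cache block touch for store,
   transient) hint, so they are ordinary cacheable stores under the normal
   PowerPC memory model rather than x86-style non-temporal stores.  */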
2184
 
2185
extern __inline void
2186
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2187
    _mm_clflush(void const *__A) {
2188
  /* Use the data cache block flush.  */
2189
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2190
}
2191
 
2192
extern __inline void
2193
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2194
    _mm_lfence(void) {
2195
  /* Use light weight sync for load to load ordering.  */
2196
  __atomic_thread_fence(__ATOMIC_RELEASE);
2197
}
2198
 
2199
extern __inline void
2200
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2201
    _mm_mfence(void) {
2202
  /* Use heavy weight sync for any to any ordering.  */
2203
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2204
}
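/* Note (editorial): the fences above use the compiler's atomic builtins; on
   POWER a release fence is typically emitted as lwsync and a sequentially
   consistent fence as hwsync (sync), which is what the comments refer to as
   light and heavy weight sync.  */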
2205
 
2206
extern __inline __m128i
2207
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2208
    _mm_cvtsi32_si128(int __A) {
2209
  return _mm_set_epi32(0, 0, 0, __A);
2210
}
2211
 
2212
extern __inline __m128i
2213
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2214
    _mm_cvtsi64_si128(long long __A) {
2215
  return __extension__(__m128i)(__v2di){__A, 0LL};
2216
}
2217
 
2218
/* Microsoft intrinsic.  */
2219
extern __inline __m128i
2220
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2221
    _mm_cvtsi64x_si128(long long __A) {
2222
  return __extension__(__m128i)(__v2di){__A, 0LL};
2223
}
2224
 
2225
/* Casts between various SP, DP, INT vector types.  Note that these do no
2226
   conversion of values; they just change the type.  */
2227
extern __inline __m128
2228
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2229
    _mm_castpd_ps(__m128d __A) {
2230
  return (__m128)__A;
2231
}
2232
 
2233
extern __inline __m128i
2234
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2235
    _mm_castpd_si128(__m128d __A) {
2236
  return (__m128i)__A;
2237
}
2238
 
2239
extern __inline __m128d
2240
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2241
    _mm_castps_pd(__m128 __A) {
2242
  return (__m128d)__A;
2243
}
2244
 
2245
extern __inline __m128i
2246
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2247
    _mm_castps_si128(__m128 __A) {
2248
  return (__m128i)__A;
2249
}
2250
 
2251
extern __inline __m128
2252
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2253
    _mm_castsi128_ps(__m128i __A) {
2254
  return (__m128)__A;
2255
}
2256
 
2257
extern __inline __m128d
2258
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2259
    _mm_castsi128_pd(__m128i __A) {
2260
  return (__m128d)__A;
2261
}
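/* Usage sketch (editorial, not part of the original header): the casts
   reinterpret the 128 bits without any value conversion, e.g. to inspect
   the raw bit pattern of packed doubles.  The helper name is hypothetical.

     static inline __m128i __example_double_bits(__m128d __x) {
       return _mm_castpd_si128(__x);  // same bits, viewed as integer lanes
     }
*/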
2262
 
2263
#else
2264
#include_next <emmintrin.h>
2265
#endif /* defined(__powerpc64__) &&                                            \
2266
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2267
 
2268
#endif /* EMMINTRIN_H_ */