/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
 
10
/* Implemented from the specification included in the Intel C++ Compiler
11
   User Guide and Reference, version 9.0.  */
12
 
13
#ifndef NO_WARN_X86_INTRINSICS
14
/* This header file is intended to help port code that uses Intel intrinsics
15
   explicitly from x86_64 to powerpc64/powerpc64le.
16
 
17
   Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18
   VMX/VSX ISA is a good match for vector float SIMD operations.
19
   However, scalar float operations in vector (XMM) registers require
20
   the POWER8 VSX ISA (2.07) level. There are differences in data
21
   format and placement of float scalars in the vector register, which
22
   require extra steps to match SSE scalar float semantics on POWER.
23
 
24
   Note that there are significant differences between the X86_64
25
   MXCSR and PowerISA's FPSCR/VSCR registers. It is recommended to use
26
   the portable <fenv.h> interface instead of accessing the MXCSR directly.
27
 
28
   Most SSE scalar float intrinsic operations can be performed more
29
   efficiently as C language float scalar operations or optimized to
30
   use vector SIMD operations. We recommend this for new applications. */
31
#error                                                                         \
32
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
33
#endif
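/* Illustrative sketch (not part of the original header): the note above
   recommends the portable <fenv.h> interface over direct MXCSR access.
   Code that adjusted the MXCSR rounding-mode bits on x86 (for example via
   _MM_SET_ROUNDING_MODE) could instead use standard C99 facilities:

     #include <fenv.h>

     static void __set_round_toward_zero (void)
     {
       fesetround (FE_TOWARDZERO);   // portable rounding-mode control
     }

   The helper name above is hypothetical; fesetround and FE_TOWARDZERO are
   standard <fenv.h> identifiers.  */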
34
 
35
#ifndef XMMINTRIN_H_
36
#define XMMINTRIN_H_
37
 
38
#if defined(__powerpc64__) &&                                                  \
39
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
40
 
41
/* Define four value permute mask */
42
#define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
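/* Illustrative sketch (not part of the original header): _MM_SHUFFLE packs
   four 2-bit element selectors into a single immediate, highest element
   first, so _MM_SHUFFLE(3, 2, 1, 0) == 0xE4 is the identity permutation.
   Used with _mm_shuffle_ps (defined further below), a hypothetical helper
   that swaps adjacent element pairs looks like:

     static __m128 __swap_pairs (__m128 __v)
     {
       // result elements: v[1], v[0], v[3], v[2]
       return _mm_shuffle_ps (__v, __v, _MM_SHUFFLE (2, 3, 0, 1));
     }
*/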
43
 
44
#include <altivec.h>
45
 
46
/* Avoid collisions between altivec.h and strict adherence to C++ and
47
   C11 standards.  This should eventually be done inside altivec.h itself,
48
   but only after testing a full distro build.  */
49
#if defined(__STRICT_ANSI__) &&                                                \
50
    (defined(__cplusplus) ||                                                   \
51
     (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
52
#undef vector
53
#undef pixel
54
#undef bool
55
#endif
56
 
57
/* We need type definitions from the MMX header file.  */
58
#include <mmintrin.h>
59
 
60
/* Get _mm_malloc () and _mm_free ().  */
61
#if __STDC_HOSTED__
62
#include <mm_malloc.h>
63
#endif
64
 
65
/* The Intel API is flexible enough that we must allow aliasing with other
66
   vector types, and their scalar components.  */
67
typedef vector float __m128 __attribute__((__may_alias__));
68
 
69
/* Unaligned version of the same type.  */
70
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
71
 
72
/* Internal data types for implementing the intrinsics.  */
73
typedef vector float __v4sf;
74
 
75
/* Create an undefined vector.  */
76
extern __inline __m128
77
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78
    _mm_undefined_ps(void) {
79
  __m128 __Y = __Y;
80
  return __Y;
81
}
82
 
83
/* Create a vector of zeros.  */
84
extern __inline __m128
85
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86
    _mm_setzero_ps(void) {
87
  return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f};
88
}
89
 
90
/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
91
extern __inline __m128
92
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93
    _mm_load_ps(float const *__P) {
94
  return ((__m128)vec_ld(0, (__v4sf *)__P));
95
}
96
 
97
/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
98
extern __inline __m128
99
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100
    _mm_loadu_ps(float const *__P) {
101
  return (vec_vsx_ld(0, __P));
102
}
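/* Illustrative sketch (not part of the original header) of the alignment
   contract stated in the comments above; the buffer names are hypothetical:

     float __aligned_buf[4] __attribute__ ((aligned (16)));
     float __plain_buf[5];

     __m128 __a = _mm_load_ps (__aligned_buf);      // requires 16-byte alignment
     __m128 __b = _mm_loadu_ps (&__plain_buf[1]);   // any alignment is accepted
*/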
103
 
104
/* Load four SPFP values in reverse order.  The address must be aligned.  */
105
extern __inline __m128
106
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
107
    _mm_loadr_ps(float const *__P) {
108
  __v4sf __tmp;
109
  __m128 __result;
110
  static const __vector unsigned char __permute_vector = {
111
      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
112
      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
113
 
114
  __tmp = vec_ld(0, (__v4sf *)__P);
115
  __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector);
116
  return __result;
117
}
118
 
119
/* Create a vector with all four elements equal to F.  */
120
extern __inline __m128
121
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
122
    _mm_set1_ps(float __F) {
123
  return __extension__(__m128)(__v4sf){__F, __F, __F, __F};
124
}
125
 
126
extern __inline __m128
127
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128
    _mm_set_ps1(float __F) {
129
  return _mm_set1_ps(__F);
130
}
131
 
132
/* Create the vector [Z Y X W].  */
133
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
134
                                      __artificial__))
135
_mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) {
136
  return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z};
137
}
138
 
139
/* Create the vector [W X Y Z].  */
140
extern __inline __m128
141
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142
    _mm_setr_ps(float __Z, float __Y, float __X, float __W) {
143
  return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W};
144
}
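/* Illustrative note (not part of the original header): _mm_set_ps takes its
   arguments from the highest element down, while _mm_setr_ps takes them in
   memory order, so

     _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f)

   and

     _mm_setr_ps (0.0f, 1.0f, 2.0f, 3.0f)

   both build the vector whose element 0 is 0.0f and element 3 is 3.0f.  */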
145
 
146
/* Store four SPFP values.  The address must be 16-byte aligned.  */
147
extern __inline void
148
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
149
    _mm_store_ps(float *__P, __m128 __A) {
150
  vec_st((__v4sf)__A, 0, (__v4sf *)__P);
151
}
152
 
153
/* Store four SPFP values.  The address need not be 16-byte aligned.  */
154
extern __inline void
155
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
156
    _mm_storeu_ps(float *__P, __m128 __A) {
157
  *(__m128_u *)__P = __A;
158
}
159
 
160
/* Store four SPFP values in reverse order.  The address must be aligned.  */
161
extern __inline void
162
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
163
    _mm_storer_ps(float *__P, __m128 __A) {
164
  __v4sf __tmp;
165
  static const __vector unsigned char __permute_vector = {
166
      0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B,
167
      0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13};
168
 
169
  __tmp = (__m128)vec_perm(__A, __A, __permute_vector);
170
 
171
  _mm_store_ps(__P, __tmp);
172
}
173
 
174
/* Store the lower SPFP value across four words.  */
175
extern __inline void
176
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177
    _mm_store1_ps(float *__P, __m128 __A) {
178
  __v4sf __va = vec_splat((__v4sf)__A, 0);
179
  _mm_store_ps(__P, __va);
180
}
181
 
182
extern __inline void
183
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184
    _mm_store_ps1(float *__P, __m128 __A) {
185
  _mm_store1_ps(__P, __A);
186
}
187
 
188
/* Create a vector with element 0 as F and the rest zero.  */
189
extern __inline __m128
190
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191
    _mm_set_ss(float __F) {
192
  return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f};
193
}
194
 
195
/* Sets the low SPFP value of A from the low value of B.  */
196
extern __inline __m128
197
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198
    _mm_move_ss(__m128 __A, __m128 __B) {
199
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
200
 
201
  return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask));
202
}
203
 
204
/* Create a vector with element 0 as *P and the rest zero.  */
205
extern __inline __m128
206
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
207
    _mm_load_ss(float const *__P) {
208
  return _mm_set_ss(*__P);
209
}
210
 
211
/* Stores the lower SPFP value.  */
212
extern __inline void
213
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214
    _mm_store_ss(float *__P, __m128 __A) {
215
  *__P = ((__v4sf)__A)[0];
216
}
217
 
218
/* Perform the respective operation on the lower SPFP (single-precision
219
   floating-point) values of A and B; the upper three SPFP values are
220
   passed through from A.  */
221
 
222
extern __inline __m128
223
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
224
    _mm_add_ss(__m128 __A, __m128 __B) {
225
#ifdef _ARCH_PWR7
226
  __m128 __a, __b, __c;
227
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
228
  /* PowerISA VSX does not allow partial (for just the lower float)
229
     results. So to ensure we don't generate spurious exceptions
230
     (from the upper float values) we splat the lower float
231
     before we do the operation.  */
232
  __a = vec_splat(__A, 0);
233
  __b = vec_splat(__B, 0);
234
  __c = __a + __b;
235
  /* Then we merge the lower float result with the original upper
236
     float elements from __A.  */
237
  return (vec_sel(__A, __c, __mask));
238
#else
239
  __A[0] = __A[0] + __B[0];
240
  return (__A);
241
#endif
242
}
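/* Illustrative sketch (not part of the original header) of the scalar-merge
   semantics described above:

     __m128 __a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);    // elements 0..3 = 1,2,3,4
     __m128 __b = _mm_set_ps (40.0f, 30.0f, 20.0f, 10.0f);
     __m128 __c = _mm_add_ss (__a, __b);
     // __c elements 0..3 are 11.0f, 2.0f, 3.0f, 4.0f: only element 0 is
     // summed, the rest pass through unchanged from __a.
*/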
243
 
244
extern __inline __m128
245
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
246
    _mm_sub_ss(__m128 __A, __m128 __B) {
247
#ifdef _ARCH_PWR7
248
  __m128 __a, __b, __c;
249
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
250
  /* PowerISA VSX does not allow partial (for just the lower float)
251
     results. So to ensure we don't generate spurious exceptions
252
     (from the upper float values) we splat the lower float
253
     before we do the operation.  */
254
  __a = vec_splat(__A, 0);
255
  __b = vec_splat(__B, 0);
256
  __c = __a - __b;
257
  /* Then we merge the lower float result with the original upper
258
     float elements from __A.  */
259
  return (vec_sel(__A, __c, __mask));
260
#else
261
  __A[0] = __A[0] - __B[0];
262
  return (__A);
263
#endif
264
}
265
 
266
extern __inline __m128
267
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268
    _mm_mul_ss(__m128 __A, __m128 __B) {
269
#ifdef _ARCH_PWR7
270
  __m128 __a, __b, __c;
271
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
272
  /* PowerISA VSX does not allow partial (for just the lower float)
273
     results. So to ensure we don't generate spurious exceptions
274
     (from the upper float values) we splat the lower float
275
     before we do the operation.  */
276
  __a = vec_splat(__A, 0);
277
  __b = vec_splat(__B, 0);
278
  __c = __a * __b;
279
  /* Then we merge the lower float result with the original upper
280
     float elements from __A.  */
281
  return (vec_sel(__A, __c, __mask));
282
#else
283
  __A[0] = __A[0] * __B[0];
284
  return (__A);
285
#endif
286
}
287
 
288
extern __inline __m128
289
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290
    _mm_div_ss(__m128 __A, __m128 __B) {
291
#ifdef _ARCH_PWR7
292
  __m128 __a, __b, __c;
293
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
294
  /* PowerISA VSX does not allow partial (for just the lower float)
295
     results. So to ensure we don't generate spurious exceptions
296
     (from the upper float values) we splat the lower float
297
     before we do the operation.  */
298
  __a = vec_splat(__A, 0);
299
  __b = vec_splat(__B, 0);
300
  __c = __a / __b;
301
  /* Then we merge the lower float result with the original upper
302
     float elements from __A.  */
303
  return (vec_sel(__A, __c, __mask));
304
#else
305
  __A[0] = __A[0] / __B[0];
306
  return (__A);
307
#endif
308
}
309
 
310
extern __inline __m128
311
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312
    _mm_sqrt_ss(__m128 __A) {
313
  __m128 __a, __c;
314
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
315
  /* PowerISA VSX does not allow partial (for just the lower float)
316
   * results. So to ensure we don't generate spurious exceptions
317
   * (from the upper float values) we splat the lower float
318
   * before we do the operation. */
319
  __a = vec_splat(__A, 0);
320
  __c = vec_sqrt(__a);
321
  /* Then we merge the lower float result with the original upper
322
   * float elements from __A.  */
323
  return (vec_sel(__A, __c, __mask));
324
}
325
 
326
/* Perform the respective operation on the four SPFP values in A and B.  */
327
extern __inline __m128
328
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329
    _mm_add_ps(__m128 __A, __m128 __B) {
330
  return (__m128)((__v4sf)__A + (__v4sf)__B);
331
}
332
 
333
extern __inline __m128
334
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335
    _mm_sub_ps(__m128 __A, __m128 __B) {
336
  return (__m128)((__v4sf)__A - (__v4sf)__B);
337
}
338
 
339
extern __inline __m128
340
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341
    _mm_mul_ps(__m128 __A, __m128 __B) {
342
  return (__m128)((__v4sf)__A * (__v4sf)__B);
343
}
344
 
345
extern __inline __m128
346
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347
    _mm_div_ps(__m128 __A, __m128 __B) {
348
  return (__m128)((__v4sf)__A / (__v4sf)__B);
349
}
350
 
351
extern __inline __m128
352
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
353
    _mm_sqrt_ps(__m128 __A) {
354
  return (vec_sqrt((__v4sf)__A));
355
}
356
 
357
extern __inline __m128
358
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
359
    _mm_rcp_ps(__m128 __A) {
360
  return (vec_re((__v4sf)__A));
361
}
362
 
363
extern __inline __m128
364
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
365
    _mm_rsqrt_ps(__m128 __A) {
366
  return (vec_rsqrte(__A));
367
}
368
 
369
extern __inline __m128
370
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371
    _mm_rcp_ss(__m128 __A) {
372
  __m128 __a, __c;
373
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
374
  /* PowerISA VSX does not allow partial (for just the lower float)
375
   * results. So to ensure we don't generate spurious exceptions
376
   * (from the upper float values) we splat the lower float
377
   * before we do the operation. */
378
  __a = vec_splat(__A, 0);
379
  __c = _mm_rcp_ps(__a);
380
  /* Then we merge the lower float result with the original upper
381
   * float elements from __A.  */
382
  return (vec_sel(__A, __c, __mask));
383
}
384
 
385
extern __inline __m128
386
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
387
    _mm_rsqrt_ss(__m128 __A) {
388
  __m128 __a, __c;
389
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
390
  /* PowerISA VSX does not allow partial (for just the lower float)
391
   * results. So to ensure we don't generate spurious exceptions
392
   * (from the upper float values) we splat the lower float
393
   * before we do the operation. */
394
  __a = vec_splat(__A, 0);
395
  __c = vec_rsqrte(__a);
396
  /* Then we merge the lower float result with the original upper
397
   * float elements from __A.  */
398
  return (vec_sel(__A, __c, __mask));
399
}
400
 
401
extern __inline __m128
402
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
403
    _mm_min_ss(__m128 __A, __m128 __B) {
404
  __v4sf __a, __b, __c;
405
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
406
  /* PowerISA VSX does not allow partial (for just lower float)
407
   * results. So to ensure we don't generate spurious exceptions
408
   * (from the upper float values) we splat the lower float
409
   * before we do the operation. */
410
  __a = vec_splat((__v4sf)__A, 0);
411
  __b = vec_splat((__v4sf)__B, 0);
412
  __c = vec_min(__a, __b);
413
  /* Then we merge the lower float result with the original upper
414
   * float elements from __A.  */
415
  return (vec_sel((__v4sf)__A, __c, __mask));
416
}
417
 
418
extern __inline __m128
419
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420
    _mm_max_ss(__m128 __A, __m128 __B) {
421
  __v4sf __a, __b, __c;
422
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
423
  /* PowerISA VSX does not allow partial (for just lower float)
424
   * results. So to ensure we don't generate spurious exceptions
425
   * (from the upper float values) we splat the lower float
426
   * before we do the operation. */
427
  __a = vec_splat(__A, 0);
428
  __b = vec_splat(__B, 0);
429
  __c = vec_max(__a, __b);
430
  /* Then we merge the lower float result with the original upper
431
   * float elements from __A.  */
432
  return (vec_sel((__v4sf)__A, __c, __mask));
433
}
434
 
435
extern __inline __m128
436
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
437
    _mm_min_ps(__m128 __A, __m128 __B) {
438
  __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A);
439
  return vec_sel(__B, __A, __m);
440
}
441
 
442
extern __inline __m128
443
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
444
    _mm_max_ps(__m128 __A, __m128 __B) {
445
  __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B);
446
  return vec_sel(__B, __A, __m);
447
}
448
 
449
/* Perform logical bit-wise operations on 128-bit values.  */
450
extern __inline __m128
451
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452
    _mm_and_ps(__m128 __A, __m128 __B) {
453
  return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B));
454
  //  return __builtin_ia32_andps (__A, __B);
455
}
456
 
457
extern __inline __m128
458
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459
    _mm_andnot_ps(__m128 __A, __m128 __B) {
460
  return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A));
461
}
462
 
463
extern __inline __m128
464
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465
    _mm_or_ps(__m128 __A, __m128 __B) {
466
  return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B));
467
}
468
 
469
extern __inline __m128
470
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
471
    _mm_xor_ps(__m128 __A, __m128 __B) {
472
  return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B));
473
}
474
 
475
/* Perform a comparison on the four SPFP values of A and B.  For each
476
   element, if the comparison is true, place a mask of all ones in the
477
   result, otherwise a mask of zeros.  */
478
extern __inline __m128
479
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480
    _mm_cmpeq_ps(__m128 __A, __m128 __B) {
481
  return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B));
482
}
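/* Illustrative sketch (not part of the original header): the all-ones /
   all-zeros masks produced by these compares combine with the bit-wise
   operations above into a branch-free per-element select, e.g. "where
   a == b take x, otherwise take y" (the helper name is hypothetical):

     static __m128 __select_eq (__m128 __a, __m128 __b, __m128 __x, __m128 __y)
     {
       __m128 __m = _mm_cmpeq_ps (__a, __b);
       return _mm_or_ps (_mm_and_ps (__m, __x), _mm_andnot_ps (__m, __y));
     }
*/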
483
 
484
extern __inline __m128
485
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486
    _mm_cmplt_ps(__m128 __A, __m128 __B) {
487
  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
488
}
489
 
490
extern __inline __m128
491
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492
    _mm_cmple_ps(__m128 __A, __m128 __B) {
493
  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
494
}
495
 
496
extern __inline __m128
497
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
498
    _mm_cmpgt_ps(__m128 __A, __m128 __B) {
499
  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
500
}
501
 
502
extern __inline __m128
503
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
504
    _mm_cmpge_ps(__m128 __A, __m128 __B) {
505
  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
506
}
507
 
508
extern __inline __m128
509
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
510
    _mm_cmpneq_ps(__m128 __A, __m128 __B) {
511
  __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B);
512
  return ((__m128)vec_nor(__temp, __temp));
513
}
514
 
515
extern __inline __m128
516
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517
    _mm_cmpnlt_ps(__m128 __A, __m128 __B) {
518
  return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B));
519
}
520
 
521
extern __inline __m128
522
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
523
    _mm_cmpnle_ps(__m128 __A, __m128 __B) {
524
  return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B));
525
}
526
 
527
extern __inline __m128
528
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
529
    _mm_cmpngt_ps(__m128 __A, __m128 __B) {
530
  return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B));
531
}
532
 
533
extern __inline __m128
534
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
535
    _mm_cmpnge_ps(__m128 __A, __m128 __B) {
536
  return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B));
537
}
538
 
539
extern __inline __m128
540
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
541
    _mm_cmpord_ps(__m128 __A, __m128 __B) {
542
  __vector unsigned int __a, __b;
543
  __vector unsigned int __c, __d;
544
  static const __vector unsigned int __float_exp_mask = {
545
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
546
 
547
  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
548
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
549
  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
550
  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
551
  return ((__m128)vec_and(__c, __d));
552
}
553
 
554
extern __inline __m128
555
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556
    _mm_cmpunord_ps(__m128 __A, __m128 __B) {
557
  __vector unsigned int __a, __b;
558
  __vector unsigned int __c, __d;
559
  static const __vector unsigned int __float_exp_mask = {
560
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
561
 
562
  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
563
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
564
  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
565
  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
566
  return ((__m128)vec_or(__c, __d));
567
}
568
 
569
/* Perform a comparison on the lower SPFP values of A and B.  If the
570
   comparison is true, place a mask of all ones in the result, otherwise a
571
   mask of zeros.  The upper three SPFP values are passed through from A.  */
572
extern __inline __m128
573
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
574
    _mm_cmpeq_ss(__m128 __A, __m128 __B) {
575
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
576
  __v4sf __a, __b, __c;
577
  /* PowerISA VMX does not allow partial (for just element 0)
578
   * results. So to ensure we don't generate spurious exceptions
579
   * (from the upper elements) we splat the lower float
580
   * before we do the operation. */
581
  __a = vec_splat((__v4sf)__A, 0);
582
  __b = vec_splat((__v4sf)__B, 0);
583
  __c = (__v4sf)vec_cmpeq(__a, __b);
584
  /* Then we merge the lower float result with the original upper
585
   * float elements from __A.  */
586
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
587
}
588
 
589
extern __inline __m128
590
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591
    _mm_cmplt_ss(__m128 __A, __m128 __B) {
592
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
593
  __v4sf __a, __b, __c;
594
  /* PowerISA VMX does not allow partial (for just element 0)
595
   * results. So to ensure we don't generate spurious exceptions
596
   * (from the upper elements) we splat the lower float
597
   * before we do the operation. */
598
  __a = vec_splat((__v4sf)__A, 0);
599
  __b = vec_splat((__v4sf)__B, 0);
600
  __c = (__v4sf)vec_cmplt(__a, __b);
601
  /* Then we merge the lower float result with the original upper
602
   * float elements from __A.  */
603
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
604
}
605
 
606
extern __inline __m128
607
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608
    _mm_cmple_ss(__m128 __A, __m128 __B) {
609
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
610
  __v4sf __a, __b, __c;
611
  /* PowerISA VMX does not allow partial (for just element 0)
612
   * results. So to ensure we don't generate spurious exceptions
613
   * (from the upper elements) we splat the lower float
614
   * before we do the operation. */
615
  __a = vec_splat((__v4sf)__A, 0);
616
  __b = vec_splat((__v4sf)__B, 0);
617
  __c = (__v4sf)vec_cmple(__a, __b);
618
  /* Then we merge the lower float result with the original upper
619
   * float elements from __A.  */
620
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
621
}
622
 
623
extern __inline __m128
624
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625
    _mm_cmpgt_ss(__m128 __A, __m128 __B) {
626
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
627
  __v4sf __a, __b, __c;
628
  /* PowerISA VMX does not allow partial (for just element 0)
629
   * results. So to ensure we don't generate spurious exceptions
630
   * (from the upper elements) we splat the lower float
631
   * before we do the operation. */
632
  __a = vec_splat((__v4sf)__A, 0);
633
  __b = vec_splat((__v4sf)__B, 0);
634
  __c = (__v4sf)vec_cmpgt(__a, __b);
635
  /* Then we merge the lower float result with the original upper
636
   * float elements from __A.  */
637
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
638
}
639
 
640
extern __inline __m128
641
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
642
    _mm_cmpge_ss(__m128 __A, __m128 __B) {
643
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
644
  __v4sf __a, __b, __c;
645
  /* PowerISA VMX does not allow partial (for just element 0)
646
   * results. So to ensure we don't generate spurious exceptions
647
   * (from the upper elements) we splat the lower float
648
   * before we do the operation. */
649
  __a = vec_splat((__v4sf)__A, 0);
650
  __b = vec_splat((__v4sf)__B, 0);
651
  __c = (__v4sf)vec_cmpge(__a, __b);
652
  /* Then we merge the lower float result with the original upper
653
   * float elements from __A.  */
654
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
655
}
656
 
657
extern __inline __m128
658
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
659
    _mm_cmpneq_ss(__m128 __A, __m128 __B) {
660
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
661
  __v4sf __a, __b, __c;
662
  /* PowerISA VMX does not allow partial (for just element 0)
663
   * results. So to ensure we don't generate spurious exceptions
664
   * (from the upper elements) we splat the lower float
665
   * before we do the operation. */
666
  __a = vec_splat((__v4sf)__A, 0);
667
  __b = vec_splat((__v4sf)__B, 0);
668
  __c = (__v4sf)vec_cmpeq(__a, __b);
669
  __c = vec_nor(__c, __c);
670
  /* Then we merge the lower float result with the original upper
671
   * float elements from __A.  */
672
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
673
}
674
 
675
extern __inline __m128
676
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677
    _mm_cmpnlt_ss(__m128 __A, __m128 __B) {
678
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
679
  __v4sf __a, __b, __c;
680
  /* PowerISA VMX does not allow partial (for just element 0)
681
   * results. So to ensure we don't generate spurious exceptions
682
   * (from the upper elements) we splat the lower float
683
   * before we do the operation. */
684
  __a = vec_splat((__v4sf)__A, 0);
685
  __b = vec_splat((__v4sf)__B, 0);
686
  __c = (__v4sf)vec_cmpge(__a, __b);
687
  /* Then we merge the lower float result with the original upper
688
   * float elements from __A.  */
689
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
690
}
691
 
692
extern __inline __m128
693
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694
    _mm_cmpnle_ss(__m128 __A, __m128 __B) {
695
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
696
  __v4sf __a, __b, __c;
697
  /* PowerISA VMX does not allow partial (for just element 0)
698
   * results. So to ensure we don't generate spurious exceptions
699
   * (from the upper elements) we splat the lower float
700
   * before we do the operation. */
701
  __a = vec_splat((__v4sf)__A, 0);
702
  __b = vec_splat((__v4sf)__B, 0);
703
  __c = (__v4sf)vec_cmpgt(__a, __b);
704
  /* Then we merge the lower float result with the original upper
705
   * float elements from __A.  */
706
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
707
}
708
 
709
extern __inline __m128
710
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
711
    _mm_cmpngt_ss(__m128 __A, __m128 __B) {
712
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
713
  __v4sf __a, __b, __c;
714
  /* PowerISA VMX does not allow partial (for just element 0)
715
   * results. So to ensure we don't generate spurious exceptions
716
   * (from the upper elements) we splat the lower float
717
   * before we do the operation. */
718
  __a = vec_splat((__v4sf)__A, 0);
719
  __b = vec_splat((__v4sf)__B, 0);
720
  __c = (__v4sf)vec_cmple(__a, __b);
721
  /* Then we merge the lower float result with the original upper
722
   * float elements from __A.  */
723
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
724
}
725
 
726
extern __inline __m128
727
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728
    _mm_cmpnge_ss(__m128 __A, __m128 __B) {
729
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
730
  __v4sf __a, __b, __c;
731
  /* PowerISA VMX does not allow partial (for just element 0)
732
   * results. So to ensure we don't generate spurious exceptions
733
   * (from the upper elements) we splat the lower float
734
   * before we do the operation. */
735
  __a = vec_splat((__v4sf)__A, 0);
736
  __b = vec_splat((__v4sf)__B, 0);
737
  __c = (__v4sf)vec_cmplt(__a, __b);
738
  /* Then we merge the lower float result with the original upper
739
   * float elements from __A.  */
740
  return ((__m128)vec_sel((__v4sf)__A, __c, __mask));
741
}
742
 
743
extern __inline __m128
744
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745
    _mm_cmpord_ss(__m128 __A, __m128 __B) {
746
  __vector unsigned int __a, __b;
747
  __vector unsigned int __c, __d;
748
  static const __vector unsigned int __float_exp_mask = {
749
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
750
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
751
 
752
  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
753
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
754
  __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a);
755
  __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b);
756
  __c = vec_and(__c, __d);
757
  /* Then we merge the lower float result with the original upper
758
   * float elements from __A.  */
759
  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
760
}
761
 
762
extern __inline __m128
763
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764
    _mm_cmpunord_ss(__m128 __A, __m128 __B) {
765
  __vector unsigned int __a, __b;
766
  __vector unsigned int __c, __d;
767
  static const __vector unsigned int __float_exp_mask = {
768
      0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000};
769
  static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
770
 
771
  __a = (__vector unsigned int)vec_abs((__v4sf)__A);
772
  __b = (__vector unsigned int)vec_abs((__v4sf)__B);
773
  __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask);
774
  __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask);
775
  __c = vec_or(__c, __d);
776
  /* Then we merge the lower float result with the original upper
777
   * float elements from __A.  */
778
  return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask));
779
}
780
 
781
/* Compare the lower SPFP values of A and B and return 1 if true
782
   and 0 if false.  */
783
extern __inline int
784
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785
    _mm_comieq_ss(__m128 __A, __m128 __B) {
786
  return (__A[0] == __B[0]);
787
}
788
 
789
extern __inline int
790
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791
    _mm_comilt_ss(__m128 __A, __m128 __B) {
792
  return (__A[0] < __B[0]);
793
}
794
 
795
extern __inline int
796
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797
    _mm_comile_ss(__m128 __A, __m128 __B) {
798
  return (__A[0] <= __B[0]);
799
}
800
 
801
extern __inline int
802
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803
    _mm_comigt_ss(__m128 __A, __m128 __B) {
804
  return (__A[0] > __B[0]);
805
}
806
 
807
extern __inline int
808
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809
    _mm_comige_ss(__m128 __A, __m128 __B) {
810
  return (__A[0] >= __B[0]);
811
}
812
 
813
extern __inline int
814
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
815
    _mm_comineq_ss(__m128 __A, __m128 __B) {
816
  return (__A[0] != __B[0]);
817
}
818
 
819
/* FIXME
820
 * The _mm_ucomi??_ss implementations below are exactly the same as
821
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
822
 * compares (scalar and vector).
823
 * Technically _mm_comieq_ss et al. should be using the ordered
824
 * compare and signal for QNaNs.
825
 * The _mm_ucomieq_ss et al. should be OK as is.
826
 */
827
extern __inline int
828
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
829
    _mm_ucomieq_ss(__m128 __A, __m128 __B) {
830
  return (__A[0] == __B[0]);
831
}
832
 
833
extern __inline int
834
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
835
    _mm_ucomilt_ss(__m128 __A, __m128 __B) {
836
  return (__A[0] < __B[0]);
837
}
838
 
839
extern __inline int
840
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841
    _mm_ucomile_ss(__m128 __A, __m128 __B) {
842
  return (__A[0] <= __B[0]);
843
}
844
 
845
extern __inline int
846
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
847
    _mm_ucomigt_ss(__m128 __A, __m128 __B) {
848
  return (__A[0] > __B[0]);
849
}
850
 
851
extern __inline int
852
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
853
    _mm_ucomige_ss(__m128 __A, __m128 __B) {
854
  return (__A[0] >= __B[0]);
855
}
856
 
857
extern __inline int
858
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
859
    _mm_ucomineq_ss(__m128 __A, __m128 __B) {
860
  return (__A[0] != __B[0]);
861
}
862
 
863
extern __inline float
864
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865
    _mm_cvtss_f32(__m128 __A) {
866
  return ((__v4sf)__A)[0];
867
}
868
 
869
/* Convert the lower SPFP value to a 32-bit integer according to the current
870
   rounding mode.  */
871
extern __inline int
872
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873
    _mm_cvtss_si32(__m128 __A) {
874
  int __res;
875
#ifdef _ARCH_PWR8
876
  double __dtmp;
877
  __asm__(
878
#ifdef __LITTLE_ENDIAN__
879
      "xxsldwi %x0,%x0,%x0,3;\n"
880
#endif
881
      "xscvspdp %x2,%x0;\n"
882
      "fctiw  %2,%2;\n"
883
      "mfvsrd  %1,%x2;\n"
884
      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
885
      :);
886
#else
887
  __res = __builtin_rint(__A[0]);
888
#endif
889
  return __res;
890
}
891
 
892
extern __inline int
893
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894
    _mm_cvt_ss2si(__m128 __A) {
895
  return _mm_cvtss_si32(__A);
896
}
897
 
898
/* Convert the lower SPFP value to a 32-bit integer according to the
899
   current rounding mode.  */
900
 
901
/* Intel intrinsic.  */
902
extern __inline long long
903
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
904
    _mm_cvtss_si64(__m128 __A) {
905
  long long __res;
906
#if defined(_ARCH_PWR8) && defined(__powerpc64__)
907
  double __dtmp;
908
  __asm__(
909
#ifdef __LITTLE_ENDIAN__
910
      "xxsldwi %x0,%x0,%x0,3;\n"
911
#endif
912
      "xscvspdp %x2,%x0;\n"
913
      "fctid  %2,%2;\n"
914
      "mfvsrd  %1,%x2;\n"
915
      : "+wa"(__A), "=r"(__res), "=f"(__dtmp)
916
      :);
917
#else
918
  __res = __builtin_llrint(__A[0]);
919
#endif
920
  return __res;
921
}
922
 
923
/* Microsoft intrinsic.  */
924
extern __inline long long
925
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
926
    _mm_cvtss_si64x(__m128 __A) {
927
  return _mm_cvtss_si64((__v4sf)__A);
928
}
929
 
930
/* Constants for use with _mm_prefetch.  */
931
enum _mm_hint {
932
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
933
  _MM_HINT_ET0 = 7,
934
  _MM_HINT_ET1 = 6,
935
  _MM_HINT_T0 = 3,
936
  _MM_HINT_T1 = 2,
937
  _MM_HINT_T2 = 1,
938
  _MM_HINT_NTA = 0
939
};
940
 
941
/* Loads one cache line from address P to a location "closer" to the
942
   processor.  The selector I specifies the type of prefetch operation.  */
943
extern __inline void
944
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945
    _mm_prefetch(const void *__P, enum _mm_hint __I) {
946
  /* Current PowerPC implementations ignore the hint parameter.  */
947
  __builtin_prefetch(__P);
948
}
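/* Illustrative sketch (not part of the original header): a typical
   prefetch-ahead loop; the function name and stride are hypothetical, and
   on PowerPC the hint argument is currently ignored as noted above.

     static void __touch_ahead (const float *__data, unsigned long __n)
     {
       for (unsigned long __i = 0; __i < __n; __i += 32)
         _mm_prefetch ((const void *) (__data + __i + 128), _MM_HINT_T0);
     }
*/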
949
 
950
/* Convert the two lower SPFP values to 32-bit integers according to the
951
   current rounding mode.  Return the integers in packed form.  */
952
extern __inline __m64
953
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
954
    _mm_cvtps_pi32(__m128 __A) {
955
  /* Splat two lower SPFP values to both halves.  */
956
  __v4sf __temp, __rounded;
957
  __vector unsigned long long __result;
958
 
959
  /* Splat two lower SPFP values to both halves.  */
960
  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
961
  __rounded = vec_rint(__temp);
962
  __result = (__vector unsigned long long)vec_cts(__rounded, 0);
963
 
964
  return (__m64)((__vector long long)__result)[0];
965
}
966
 
967
extern __inline __m64
968
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969
    _mm_cvt_ps2pi(__m128 __A) {
970
  return _mm_cvtps_pi32(__A);
971
}
972
 
973
/* Truncate the lower SPFP value to a 32-bit integer.  */
974
extern __inline int
975
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
976
    _mm_cvttss_si32(__m128 __A) {
977
  /* Extract the lower float element.  */
978
  float __temp = __A[0];
979
  /* truncate to 32-bit integer and return.  */
980
  return __temp;
981
}
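/* Illustrative sketch (not part of the original header) contrasting the
   rounding and truncating conversions, assuming the default
   round-to-nearest-even mode:

     int __near = _mm_cvtss_si32 (_mm_set_ss (2.7f));    // 3: uses the current rounding mode
     int __trunc = _mm_cvttss_si32 (_mm_set_ss (2.7f));  // 2: always truncates toward zero
*/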
982
 
983
extern __inline int
984
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
985
    _mm_cvtt_ss2si(__m128 __A) {
986
  return _mm_cvttss_si32(__A);
987
}
988
 
989
/* Intel intrinsic.  */
990
extern __inline long long
991
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992
    _mm_cvttss_si64(__m128 __A) {
993
  /* Extract the lower float element.  */
994
  float __temp = __A[0];
995
  /* truncate to a 64-bit integer and return.  */
996
  return __temp;
997
}
998
 
999
/* Microsoft intrinsic.  */
1000
extern __inline long long
1001
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1002
    _mm_cvttss_si64x(__m128 __A) {
1003
  /* Extract the lower float element.  */
1004
  float __temp = __A[0];
1005
  /* truncate to a 64-bit integer and return.  */
1006
  return __temp;
1007
}
1008
 
1009
/* Truncate the two lower SPFP values to 32-bit integers.  Return the
1010
   integers in packed form.  */
1011
extern __inline __m64
1012
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1013
    _mm_cvttps_pi32(__m128 __A) {
1014
  __v4sf __temp;
1015
  __vector unsigned long long __result;
1016
 
1017
  /* Splat two lower SPFP values to both halves.  */
1018
  __temp = (__v4sf)vec_splat((__vector long long)__A, 0);
1019
  __result = (__vector unsigned long long)vec_cts(__temp, 0);
1020
 
1021
  return (__m64)((__vector long long)__result)[0];
1022
}
1023
 
1024
extern __inline __m64
1025
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026
    _mm_cvtt_ps2pi(__m128 __A) {
1027
  return _mm_cvttps_pi32(__A);
1028
}
1029
 
1030
/* Convert B to a SPFP value and insert it as element zero in A.  */
1031
extern __inline __m128
1032
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1033
    _mm_cvtsi32_ss(__m128 __A, int __B) {
1034
  float __temp = __B;
1035
  __A[0] = __temp;
1036
 
1037
  return __A;
1038
}
1039
 
1040
extern __inline __m128
1041
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042
    _mm_cvt_si2ss(__m128 __A, int __B) {
1043
  return _mm_cvtsi32_ss(__A, __B);
1044
}
1045
 
1046
/* Convert B to a SPFP value and insert it as element zero in A.  */
1047
/* Intel intrinsic.  */
1048
extern __inline __m128
1049
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050
    _mm_cvtsi64_ss(__m128 __A, long long __B) {
1051
  float __temp = __B;
1052
  __A[0] = __temp;
1053
 
1054
  return __A;
1055
}
1056
 
1057
/* Microsoft intrinsic.  */
1058
extern __inline __m128
1059
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060
    _mm_cvtsi64x_ss(__m128 __A, long long __B) {
1061
  return _mm_cvtsi64_ss(__A, __B);
1062
}
1063
 
1064
/* Convert the two 32-bit values in B to SPFP form and insert them
1065
   as the two lower elements in A.  */
1066
extern __inline __m128
1067
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068
    _mm_cvtpi32_ps(__m128 __A, __m64 __B) {
1069
  __vector signed int __vm1;
1070
  __vector float __vf1;
1071
 
1072
  __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B};
1073
  __vf1 = (__vector float)vec_ctf(__vm1, 0);
1074
 
1075
  return ((__m128)(__vector unsigned long long){
1076
      ((__vector unsigned long long)__vf1)[0],
1077
      ((__vector unsigned long long)__A)[1]});
1078
}
1079
 
1080
extern __inline __m128
1081
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082
    _mm_cvt_pi2ps(__m128 __A, __m64 __B) {
1083
  return _mm_cvtpi32_ps(__A, __B);
1084
}
1085
 
1086
/* Convert the four signed 16-bit values in A to SPFP form.  */
1087
extern __inline __m128
1088
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1089
    _mm_cvtpi16_ps(__m64 __A) {
1090
  __vector signed short __vs8;
1091
  __vector signed int __vi4;
1092
  __vector float __vf1;
1093
 
1094
  __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A};
1095
  __vi4 = vec_vupklsh(__vs8);
1096
  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1097
 
1098
  return (__m128)__vf1;
1099
}
1100
 
1101
/* Convert the four unsigned 16-bit values in A to SPFP form.  */
1102
extern __inline __m128
1103
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104
    _mm_cvtpu16_ps(__m64 __A) {
1105
  const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1106
  __vector unsigned short __vs8;
1107
  __vector unsigned int __vi4;
1108
  __vector float __vf1;
1109
 
1110
  __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A};
1111
  __vi4 = (__vector unsigned int)vec_mergel
1112
#ifdef __LITTLE_ENDIAN__
1113
      (__vs8, __zero);
1114
#else
1115
      (__zero, __vs8);
1116
#endif
1117
  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1118
 
1119
  return (__m128)__vf1;
1120
}
1121
 
1122
/* Convert the low four signed 8-bit values in A to SPFP form.  */
1123
extern __inline __m128
1124
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1125
    _mm_cvtpi8_ps(__m64 __A) {
1126
  __vector signed char __vc16;
1127
  __vector signed short __vs8;
1128
  __vector signed int __vi4;
1129
  __vector float __vf1;
1130
 
1131
  __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A};
1132
  __vs8 = vec_vupkhsb(__vc16);
1133
  __vi4 = vec_vupkhsh(__vs8);
1134
  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1135
 
1136
  return (__m128)__vf1;
1137
}
1138
 
1139
/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
1140
extern __inline __m128
1141
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142
 
1143
    _mm_cvtpu8_ps(__m64 __A) {
1144
  const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0};
1145
  __vector unsigned char __vc16;
1146
  __vector unsigned short __vs8;
1147
  __vector unsigned int __vi4;
1148
  __vector float __vf1;
1149
 
1150
  __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A};
1151
#ifdef __LITTLE_ENDIAN__
1152
  __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero);
1153
  __vi4 =
1154
      (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero);
1155
#else
1156
  __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16);
1157
  __vi4 =
1158
      (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8);
1159
#endif
1160
  __vf1 = (__vector float)vec_ctf(__vi4, 0);
1161
 
1162
  return (__m128)__vf1;
1163
}
1164
 
1165
/* Convert the four signed 32-bit values in A and B to SPFP form.  */
1166
extern __inline __m128
1167
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168
    _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) {
1169
  __vector signed int __vi4;
1170
  __vector float __vf4;
1171
 
1172
  __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B};
1173
  __vf4 = (__vector float)vec_ctf(__vi4, 0);
1174
  return (__m128)__vf4;
1175
}
1176
 
1177
/* Convert the four SPFP values in A to four signed 16-bit integers.  */
1178
extern __inline __m64
1179
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1180
    _mm_cvtps_pi16(__m128 __A) {
1181
  __v4sf __rounded;
1182
  __vector signed int __temp;
1183
  __vector unsigned long long __result;
1184
 
1185
  __rounded = vec_rint(__A);
1186
  __temp = vec_cts(__rounded, 0);
1187
  __result = (__vector unsigned long long)vec_pack(__temp, __temp);
1188
 
1189
  return (__m64)((__vector long long)__result)[0];
1190
}
1191
 
1192
/* Convert the four SPFP values in A to four signed 8-bit integers.  */
1193
extern __inline __m64
1194
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195
    _mm_cvtps_pi8(__m128 __A) {
1196
  __v4sf __rounded;
1197
  __vector signed int __tmp_i;
1198
  static const __vector signed int __zero = {0, 0, 0, 0};
1199
  __vector signed short __tmp_s;
1200
  __vector signed char __res_v;
1201
 
1202
  __rounded = vec_rint(__A);
1203
  __tmp_i = vec_cts(__rounded, 0);
1204
  __tmp_s = vec_pack(__tmp_i, __zero);
1205
  __res_v = vec_pack(__tmp_s, __tmp_s);
1206
  return (__m64)((__vector long long)__res_v)[0];
1207
}
1208
 
1209
/* Selects four specific SPFP values from A and B based on MASK.  */
1210
extern __inline __m128
1211
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212
 
1213
    _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) {
1214
  unsigned long __element_selector_10 = __mask & 0x03;
1215
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
1216
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
1217
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
1218
  static const unsigned int __permute_selectors[4] = {
1219
#ifdef __LITTLE_ENDIAN__
1220
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1221
#else
1222
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1223
#endif
1224
  };
1225
  __vector unsigned int __t;
1226
 
1227
  __t[0] = __permute_selectors[__element_selector_10];
1228
  __t[1] = __permute_selectors[__element_selector_32];
1229
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
1230
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
1231
  return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t);
1232
}
1233
 
1234
/* Selects and interleaves the upper two SPFP values from A and B.  */
1235
extern __inline __m128
1236
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1237
    _mm_unpackhi_ps(__m128 __A, __m128 __B) {
1238
  return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B);
1239
}
1240
 
1241
/* Selects and interleaves the lower two SPFP values from A and B.  */
1242
extern __inline __m128
1243
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244
    _mm_unpacklo_ps(__m128 __A, __m128 __B) {
1245
  return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B);
1246
}
1247
 
1248
/* Sets the upper two SPFP values with 64 bits of data loaded from P;
1249
   the lower two values are passed through from A.  */
1250
extern __inline __m128
1251
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1252
    _mm_loadh_pi(__m128 __A, __m64 const *__P) {
1253
  __vector unsigned long long __a = (__vector unsigned long long)__A;
1254
  __vector unsigned long long __p = vec_splats(*__P);
1255
  __a[1] = __p[1];
1256
 
1257
  return (__m128)__a;
1258
}
1259
 
1260
/* Stores the upper two SPFP values of A into P.  */
1261
extern __inline void
1262
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263
    _mm_storeh_pi(__m64 *__P, __m128 __A) {
1264
  __vector unsigned long long __a = (__vector unsigned long long)__A;
1265
 
1266
  *__P = __a[1];
1267
}
1268
 
1269
/* Moves the upper two values of B into the lower two values of A.  */
1270
extern __inline __m128
1271
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272
    _mm_movehl_ps(__m128 __A, __m128 __B) {
1273
  return (__m128)vec_mergel((__vector unsigned long long)__B,
1274
                            (__vector unsigned long long)__A);
1275
}
1276
 
1277
/* Moves the lower two values of B into the upper two values of A.  */
1278
extern __inline __m128
1279
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280
    _mm_movelh_ps(__m128 __A, __m128 __B) {
1281
  return (__m128)vec_mergeh((__vector unsigned long long)__A,
1282
                            (__vector unsigned long long)__B);
1283
}
1284
 
1285
/* Sets the lower two SPFP values with 64 bits of data loaded from P;
1286
   the upper two values are passed through from A.  */
1287
extern __inline __m128
1288
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289
    _mm_loadl_pi(__m128 __A, __m64 const *__P) {
1290
  __vector unsigned long long __a = (__vector unsigned long long)__A;
1291
  __vector unsigned long long __p = vec_splats(*__P);
1292
  __a[0] = __p[0];
1293
 
1294
  return (__m128)__a;
1295
}
1296
 
1297
/* Stores the lower two SPFP values of A into P.  */
1298
extern __inline void
1299
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1300
    _mm_storel_pi(__m64 *__P, __m128 __A) {
1301
  __vector unsigned long long __a = (__vector unsigned long long)__A;
1302
 
1303
  *__P = __a[0];
1304
}
1305
 
1306
#ifdef _ARCH_PWR8
1307
/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1308
 
1309
/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
1310
extern __inline int
1311
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1312
    _mm_movemask_ps(__m128 __A) {
1313
#ifdef _ARCH_PWR10
1314
  return vec_extractm((__vector unsigned int)__A);
1315
#else
1316
  __vector unsigned long long __result;
1317
  static const __vector unsigned int __perm_mask = {
1318
#ifdef __LITTLE_ENDIAN__
1319
      0x00204060, 0x80808080, 0x80808080, 0x80808080
1320
#else
1321
      0x80808080, 0x80808080, 0x80808080, 0x00204060
1322
#endif
1323
  };
1324
 
1325
  __result = ((__vector unsigned long long)vec_vbpermq(
1326
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1327
 
1328
#ifdef __LITTLE_ENDIAN__
1329
  return __result[1];
1330
#else
1331
  return __result[0];
1332
#endif
1333
#endif /* !_ARCH_PWR10 */
1334
}
1335
#endif /* _ARCH_PWR8 */
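/* Illustrative sketch (not part of the original header): on PowerISA 2.07
   (_ARCH_PWR8) and later, a packed compare plus _mm_movemask_ps gives a
   cheap "any element" test; the helper name is hypothetical. This tests
   whether any element of __a is strictly less than zero:

     static int __any_negative (__m128 __a)
     {
       return _mm_movemask_ps (_mm_cmplt_ps (__a, _mm_setzero_ps ())) != 0;
     }
*/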
1336
 
1337
/* Create a vector with all four elements equal to *P.  */
1338
extern __inline __m128
1339
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1340
    _mm_load1_ps(float const *__P) {
1341
  return _mm_set1_ps(*__P);
1342
}
1343
 
1344
extern __inline __m128
1345
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1346
    _mm_load_ps1(float const *__P) {
1347
  return _mm_load1_ps(__P);
1348
}
1349
 
1350
/* Extracts one of the four words of A.  The selector N must be immediate.  */
1351
extern __inline int
1352
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353
    _mm_extract_pi16(__m64 const __A, int const __N) {
1354
  unsigned int __shiftr = __N & 3;
1355
#ifdef __BIG_ENDIAN__
1356
  __shiftr = 3 - __shiftr;
1357
#endif
1358
 
1359
  return ((__A >> (__shiftr * 16)) & 0xffff);
1360
}
1361
 
1362
extern __inline int
1363
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364
    _m_pextrw(__m64 const __A, int const __N) {
1365
  return _mm_extract_pi16(__A, __N);
1366
}
1367
 
1368
/* Inserts word D into one of four words of A.  The selector N must be
1369
   immediate.  */
1370
extern __inline __m64
1371
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372
    _mm_insert_pi16(__m64 const __A, int const __D, int const __N) {
1373
  const int __shiftl = (__N & 3) * 16;
1374
  const __m64 __shiftD = (const __m64)__D << __shiftl;
1375
  const __m64 __mask = 0xffffUL << __shiftl;
1376
  __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
1377
 
1378
  return __result;
1379
}
1380
 
1381
extern __inline __m64
1382
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1383
    _m_pinsrw(__m64 const __A, int const __D, int const __N) {
1384
  return _mm_insert_pi16(__A, __D, __N);
1385
}
1386
 
/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pi16(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __r;
  __vector __bool short __c;

  __a = (__vector signed short)vec_splats(__A);
  __b = (__vector signed short)vec_splats(__B);
  __c = (__vector __bool short)vec_cmpgt(__a, __b);
  __r = vec_sel(__b, __a, __c);
  return (__m64)((__vector long long)__r)[0];
#else
  __m64_union __m1, __m2, __res;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0]
                                                            : __m2.as_short[0];
  __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1]
                                                            : __m2.as_short[1];
  __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2]
                                                            : __m2.as_short[2];
  __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3]
                                                            : __m2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaxsw(__m64 __A, __m64 __B) {
  return _mm_max_pi16(__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_pu8(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats(__A);
  __b = (__vector unsigned char)vec_splats(__B);
  __c = (__vector __bool char)vec_cmpgt(__a, __b);
  __r = vec_sel(__b, __a, __c);
  return (__m64)((__vector long long)__r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
        ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i])
            ? __m1.as_char[__i]
            : __m2.as_char[__i];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaxub(__m64 __A, __m64 __B) {
  return _mm_max_pu8(__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pi16(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __r;
  __vector __bool short __c;

  __a = (__vector signed short)vec_splats(__A);
  __b = (__vector signed short)vec_splats(__B);
  __c = (__vector __bool short)vec_cmplt(__a, __b);
  __r = vec_sel(__b, __a, __c);
  return (__m64)((__vector long long)__r)[0];
#else
  __m64_union __m1, __m2, __res;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0]
                                                            : __m2.as_short[0];
  __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1]
                                                            : __m2.as_short[1];
  __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2]
                                                            : __m2.as_short[2];
  __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3]
                                                            : __m2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pminsw(__m64 __A, __m64 __B) {
  return _mm_min_pi16(__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_pu8(__m64 __A, __m64 __B) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats(__A);
  __b = (__vector unsigned char)vec_splats(__B);
  __c = (__vector __bool char)vec_cmplt(__a, __b);
  __r = vec_sel(__b, __a, __c);
  return (__m64)((__vector long long)__r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
        ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i])
            ? __m1.as_char[__i]
            : __m2.as_char[__i];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pminub(__m64 __A, __m64 __B) {
  return _mm_min_pu8(__A, __B);
}
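
/* Illustrative sketch (not part of the header): the max/min pairs above can
   be combined to clamp each signed 16-bit lane into a range.  _mm_set1_pi16
   comes from <mmintrin.h>.

     __m64 __lo = _mm_set1_pi16(0);
     __m64 __hi = _mm_set1_pi16(255);
     __m64 __clamped = _mm_min_pi16(_mm_max_pi16(__x, __lo), __hi);
     // for some __m64 __x; every lane of __clamped is in [0, 255]
*/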
 
/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_pi8(__m64 __A) {
#ifdef __powerpc64__
  unsigned long long __p =
#ifdef __LITTLE_ENDIAN__
      0x0008101820283038UL; // permute control for sign bits
#else
      0x3830282018100800UL; // permute control for sign bits
#endif
  return __builtin_bpermd(__p, __A);
#else
#ifdef __LITTLE_ENDIAN__
  unsigned int __mask = 0x20283038UL;
  unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf;
  unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
#else
  unsigned int __mask = 0x38302820UL;
  unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf;
  unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf;
#endif
  return (__r2 << 4) | __r1;
#endif
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmovmskb(__m64 __A) {
  return _mm_movemask_pi8(__A);
}
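
/* Illustrative sketch (not part of the header): bit i of the result is the
   sign (high) bit of byte lane i.  _mm_set_pi8 comes from <mmintrin.h> and
   takes its arguments from lane 7 down to lane 0.

     __m64 __v = _mm_set_pi8(-1, 0, -1, 0, 0, 0, 0, -1);
     int __m = _mm_movemask_pi8(__v);   // __m == 0xA1 (bits 7, 5 and 0 set)
*/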
 
/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pu16(__m64 __A, __m64 __B) {
  __vector unsigned short __a, __b;
  __vector unsigned short __c;
  __vector unsigned int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector unsigned short)vec_splats(__A);
  __b = (__vector unsigned short)vec_splats(__B);

  __w0 = vec_vmuleuh(__a, __b);
  __w1 = vec_vmulouh(__a, __b);
  __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhuw(__m64 __A, __m64 __B) {
  return _mm_mulhi_pu16(__A, __B);
}
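
/* Illustrative sketch (not part of the header): each lane receives the upper
   half of the full 32-bit unsigned product.  _mm_set1_pi16 comes from
   <mmintrin.h>; 50000 * 3 = 150000 = 0x000249F0, so the high half is 2.

     __m64 __h = _mm_mulhi_pu16(_mm_set1_pi16((short)50000),
                                _mm_set1_pi16(3));
     // every 16-bit lane of __h is 2
*/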
 
/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi16(__m64 __A, int const __N) {
  unsigned long __element_selector_10 = __N & 0x03;
  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
  };
  __m64_union __t;
  __vector unsigned long long __a, __p, __r;

#ifdef __LITTLE_ENDIAN__
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
#else
  __t.as_short[3] = __permute_selectors[__element_selector_10];
  __t.as_short[2] = __permute_selectors[__element_selector_32];
  __t.as_short[1] = __permute_selectors[__element_selector_54];
  __t.as_short[0] = __permute_selectors[__element_selector_76];
#endif
  __p = vec_splats(__t.as_m64);
  __a = vec_splats(__A);
  __r = vec_perm(__a, __a, (__vector unsigned char)__p);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pshufw(__m64 __A, int const __N) {
  return _mm_shuffle_pi16(__A, __N);
}
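
/* Illustrative sketch (not part of the header): the selector is encoded with
   _MM_SHUFFLE, whose last argument chooses the source lane of result lane 0,
   so _MM_SHUFFLE(0, 1, 2, 3) reverses the four lanes.  _mm_set_pi16 comes
   from <mmintrin.h>.

     __m64 __v = _mm_set_pi16(4, 3, 2, 1);   // lanes 3..0 hold 4, 3, 2, 1
     __m64 __r = _mm_shuffle_pi16(__v, _MM_SHUFFLE(0, 1, 2, 3));
     // __r holds 1, 2, 3, 4 in lanes 3..0
*/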
 
/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) {
  __m64 __hibit = 0x8080808080808080UL;
  __m64 __mask, __tmp;
  __m64 *__p = (__m64 *)__P;

  __tmp = *__p;
  __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit);
  __tmp = (__tmp & (~__mask)) | (__A & __mask);
  *__p = __tmp;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_maskmovq(__m64 __A, __m64 __N, char *__P) {
  _mm_maskmove_si64(__A, __N, __P);
}
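
/* Illustrative sketch (not part of the header): only the byte lanes whose
   selector byte has its high bit set are taken from A; the remaining bytes
   of the destination keep their previous contents.  _mm_set_pi8 comes from
   <mmintrin.h>; __buf, __data and __sel are hypothetical.

     char __buf[8] = {0};
     __m64 __data = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
     __m64 __sel = _mm_set_pi8(0, 0, 0, 0, 0, 0, 0, -1);  // select lane 0 only
     _mm_maskmove_si64(__data, __sel, __buf);  // only lane 0 (value 1) is stored
*/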
 
/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_pu8(__m64 __A, __m64 __B) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__A);
  __b = (__vector unsigned char)vec_splats(__B);
  __c = vec_avg(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pavgb(__m64 __A, __m64 __B) {
  return _mm_avg_pu8(__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_pu16(__m64 __A, __m64 __B) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__A);
  __b = (__vector unsigned short)vec_splats(__B);
  __c = vec_avg(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pavgw(__m64 __A, __m64 __B) {
  return _mm_avg_pu16(__A, __B);
}
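
/* Illustrative sketch (not part of the header): the average rounds up on
   ties, i.e. (a + b + 1) >> 1 per lane.  _mm_set1_pi8 comes from
   <mmintrin.h>.

     __m64 __r = _mm_avg_pu8(_mm_set1_pi8(10), _mm_set1_pi8(15));
     // every byte lane of __r is 13
*/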
 
/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_pu8(__m64 __A, __m64 __B) {
  __vector unsigned char __a, __b;
  __vector unsigned char __vmin, __vmax, __vabsdiff;
  __vector signed int __vsum;
  const __vector unsigned int __zero = {0, 0, 0, 0};
  __m64_union __result = {0};

  __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A};
  __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B};
  __vmin = vec_min(__a, __b);
  __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
  /* Sum across four integers with integer result.  */
  __vsum = vec_sums(__vsum, (__vector signed int)__zero);
  /* The sum is in the rightmost 32 bits of the vector result.
     Transfer to a GPR and truncate to 16 bits.  */
  __result.as_short[0] = __vsum[3];
  return __result.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psadbw(__m64 __A, __m64 __B) {
  return _mm_sad_pu8(__A, __B);
}
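
/* Illustrative sketch (not part of the header): the eight absolute byte
   differences are summed into the low 16-bit lane; the other lanes are
   zero.  _mm_set_pi8 and _mm_setzero_si64 come from <mmintrin.h>.

     __m64 __a = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
     __m64 __s = _mm_sad_pu8(__a, _mm_setzero_si64());
     int __sum = _mm_extract_pi16(__s, 0);   // __sum == 36 (1 + 2 + ... + 8)
*/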
 
/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pi(__m64 *__P, __m64 __A) {
  /* Use the data cache block touch for store transient.  */
  __asm__("     dcbtstt 0,%0" : : "b"(__P) : "memory");
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_ps(float *__P, __m128 __A) {
  /* Use the data cache block touch for store transient.  */
  __asm__("     dcbtstt 0,%0" : : "b"(__P) : "memory");
  _mm_store_ps(__P, __A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sfence(void) {
  /* Generate a lightweight sync (lwsync) via a release fence.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}
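
/* Illustrative sketch (not part of the header): a release fence is the
   classic use of _mm_sfence, publishing data before a flag.  The variables
   payload and ready_flag are hypothetical.

     payload = 42;                                       // ordinary store
     _mm_sfence();                                       // release barrier
     __atomic_store_n(&ready_flag, 1, __ATOMIC_RELAXED); // flag becomes visible
                                                         // after the payload
*/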
 
/* The execution of the next instruction is delayed by an implementation-
   specific amount of time.  The instruction does not modify the
   architectural state.  On x86 this intrinsic does not require SSE
   support; its encoding is a nop on processors that do not support it.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_pause(void) {
  /* There is no exact match for this construct, but the following is
     close to the desired effect.  */
#if _ARCH_PWR8
  /* On POWER8 and later processors we can depend on Program Priority
     (PRI) and its associated "very low" setting.  Since we don't know
     what PRI this thread is running at, we: 1) save the current PRI
     from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution.  */
  unsigned long __PPR;

  __asm__ volatile("    mfppr   %0;"
                   "   or 31,31,31;"
                   "   isync;"
                   "   lwsync;"
                   "   isync;"
                   "   mtppr    %0;"
                   : "=r"(__PPR)
                   :
                   : "memory");
#else
  /* For older processors, where we may not even have Program Priority
     controls, we can only depend on a heavyweight sync.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
}
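
/* Illustrative sketch (not part of the header): _mm_pause is typically used
   inside a spin-wait loop to reduce the cost of busy waiting.  The variable
   ready_flag is hypothetical.

     while (!__atomic_load_n(&ready_flag, __ATOMIC_ACQUIRE))
       _mm_pause();
*/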
 
/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                              \
  do {                                                                         \
    __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);         \
    __v4sf __t0 = vec_vmrghw(__r0, __r1);                                      \
    __v4sf __t1 = vec_vmrghw(__r2, __r3);                                      \
    __v4sf __t2 = vec_vmrglw(__r0, __r1);                                      \
    __v4sf __t3 = vec_vmrglw(__r2, __r3);                                      \
    (row0) = (__v4sf)vec_mergeh((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row1) = (__v4sf)vec_mergel((__vector long long)__t0,                      \
                                (__vector long long)__t1);                     \
    (row2) = (__v4sf)vec_mergeh((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
    (row3) = (__v4sf)vec_mergel((__vector long long)__t2,                      \
                                (__vector long long)__t3);                     \
  } while (0)
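
/* Illustrative sketch (not part of the header): the macro transposes four
   rows in place, so row0 ends up holding the first column, and so on.
   _mm_setr_ps is defined earlier in this header.

     __m128 __r0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
     __m128 __r1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
     __m128 __r2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
     __m128 __r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
     _MM_TRANSPOSE4_PS(__r0, __r1, __r2, __r3);
     // __r0 now holds {1, 5, 9, 13}, __r1 holds {2, 6, 10, 14}, ...
*/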
 
/* For backward source compatibility.  */
//# include <emmintrin.h>

#else
#include_next <xmmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* XMMINTRIN_H_ */