/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
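/* Editorial note (not part of the upstream header): a minimal sketch of the
   GNU C rewrite suggested above, for a hypothetical caller that adds four
   32-bit integers element-wise:

     typedef int __v4si_example __attribute__((__vector_size__(16)));

     static inline __v4si_example
     add4_example(__v4si_example __a, __v4si_example __b) {
       return __a + __b;   // portable replacement for _mm_add_epi32(__a, __b)
     }

   GCC and Clang lower generic vector arithmetic to the best instructions on
   each target, which is why such rewrites are preferred over the
   compatibility intrinsics defined below.  */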

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
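/* Usage sketch (illustrative only, assuming the SSE2 compatibility layer
   reached through the include chain above provides _mm_set_pd):

     __m128d __v = _mm_set_pd(2.5, -1.5);   // elements {-1.5, 2.5}
     __m128d __f = _mm_round_pd(__v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
     // __f holds {-2.0, 2.0}; _MM_FROUND_NO_EXC suppresses inexact exceptions
     // by saving, clearing, and restoring the FPSCR enable bits as above.  */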

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
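/* Usage sketch (illustrative only): _mm_extract_ps returns the raw bit
   pattern of the selected float as an int, matching the x86 semantics,
   while the integer insert/extract pairs round-trip values directly:

     __m128 __v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // element 0 is 1.0f
     int __bits = _mm_extract_ps(__v, 0);               // 0x3f800000, not 1
     __m128i __w = _mm_insert_epi32(_mm_setzero_si128(), 42, 2);
     // _mm_extract_epi32(__w, 2) == 42
*/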

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}
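/* Usage sketch (illustrative only): each low bit of __imm8 selects the
   corresponding 32-bit lane from __B (1) or __A (0), which is what the
   permute control table above encodes; for __imm8 == 0x5 lanes 0 and 2
   come from __B:

     __m128 __a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
     __m128 __b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);
     __m128 __c = _mm_blend_ps(__a, __b, 0x5);   // lanes {4.0f, 1.0f, 6.0f, 3.0f}
*/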

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
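/* Usage sketch (illustrative only): _mm_testz_si128 returns 1 when
   (__A & __B) is all zeros and _mm_testc_si128 returns 1 when (~__A & __B)
   is all zeros, mirroring the ZF/CF results of the x86 PTEST instruction:

     __m128i __m = _mm_set1_epi32(0x0000ffff);
     __m128i __v = _mm_set1_epi32(0x00ff0000);
     // _mm_testz_si128(__m, __v) == 1 because no bit is set in both operands.
*/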

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif
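/* Usage sketch (illustrative only): _mm_mullo_epi32 keeps the low 32 bits
   of each 32x32 product, while _mm_mul_epi32 produces two full 64-bit
   products from the even-numbered 32-bit elements:

     __m128i __x = _mm_set1_epi32(0x10000);      // 65536 in every lane
     __m128i __lo = _mm_mullo_epi32(__x, __x);   // each lane: 0 (low 32 bits of 2^32)
     __m128i __wide = _mm_mul_epi32(__x, __x);   // two 64-bit lanes of 0x100000000
*/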

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}
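/* Usage sketch (illustrative only): the _mm_cvtepi* forms sign-extend and
   the _mm_cvtepu* forms zero-extend the low elements of the source vector:

     __m128i __b = _mm_set1_epi8((char)0xff);   // every byte is -1 / 0xff
     __m128i __s = _mm_cvtepi8_epi32(__b);      // four 32-bit lanes of -1
     __m128i __u = _mm_cvtepu8_epi32(__b);      // four 32-bit lanes of 255
*/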

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
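/* Usage sketch (illustrative only): the minimum value lands in halfword 0
   of the result and its index in halfword 1 (bits [18:16] of the vector):

     __m128i __v = _mm_set_epi16(9, 8, 7, 3, 6, 5, 4, 10);
     __m128i __m = _mm_minpos_epu16(__v);
     // _mm_extract_epi16(__m, 0) == 3 and _mm_extract_epi16(__m, 1) == 4
*/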

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */