/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

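/* Vector absolute value (PABSB/PABSW/PABSD).  Each 128-bit form maps
   directly to vec_abs; the __m64 forms splat the operand into both
   doublewords of a 128-bit vector, take vec_abs, and return doubleword 0
   of the result.  */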
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi16(__m128i __A) {
  return (__m128i)vec_abs((__v8hi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi32(__m128i __A) {
  return (__m128i)vec_abs((__v4si)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi8(__m128i __A) {
  return (__m128i)vec_abs((__v16qi)__A);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi16(__m64 __A) {
  __v8hi __B = (__v8hi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi32(__m64 __A) {
  __v4si __B = (__v4si)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi8(__m64 __A) {
  __v16qi __B = (__v16qi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

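/* Byte-wise alignment (PALIGNR): treat __A:__B as a 32-byte value, shift it
   right by __count bytes, and return the low 16 bytes.  A constant
   __count < 16 becomes a single vec_sld (with vec_reve fixups on
   little-endian); otherwise the shift is synthesized from vec_slo/vec_sro.
   For example, with __count == 4 the result is bytes 4..15 of __B followed
   by bytes 0..3 of __A.  */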
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
  if (__builtin_constant_p(__count) && __count < 16) {
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
    __B = (__m128i)vec_reve((__v16qu)__B);
#endif
    __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
#endif
    return __A;
  }

  if (__count == 0)
    return __B;

  if (__count >= 16) {
    if (__count >= 32) {
      const __v16qu __zero = {0};
      return (__m128i)__zero;
    } else {
      const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
      return (__m128i)vec_sro((__v16qu)__A, __shift);
#else
      return (__m128i)vec_slo((__v16qu)__A, __shift);
#endif
    }
  } else {
    const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
    const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
#else
    __A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
#endif
    return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
  }
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
  if (__count < 16) {
    __v2du __C = {__B, __A};
#ifdef __LITTLE_ENDIAN__
    const __v4su __shift = {__count << 3, 0, 0, 0};
    __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
#else
    const __v4su __shift = {0, 0, 0, __count << 3};
    __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
#endif
    return (__m64)__C[0];
  } else {
    const __m64 __zero = {0};
    return __zero;
  }
}

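/* Horizontal add (PHADDW/PHADDD/PHADDSW): sums adjacent element pairs of
   __A and then of __B, e.g. _mm_hadd_epi16 returns
   {a0+a1, a2+a3, ..., a6+a7, b0+b1, ..., b6+b7}.  The non-saturating forms
   gather even- and odd-position elements with two vec_perm masks and add
   them; the saturating forms sum adjacent halfword pairs with vec_sum4s
   and re-pack with vec_packs.  */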
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi32(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_epi16(__m128i __A, __m128i __B) {
  __v4si __C = {0}, __D = {0};
  __C = vec_sum4s((__v8hi)__A, __C);
  __D = vec_sum4s((__v8hi)__B, __D);
  __C = (__v4si)vec_packs(__C, __D);
  return (__m128i)__C;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_pi16(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v4si __D = vec_sum4s(__C, __zero);
  __C = vec_packs(__D, __D);
  return (__m64)((__v2du)__C)[1];
}

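/* Horizontal subtract (PHSUBW/PHSUBD/PHSUBSW): like the horizontal adds
   above, but each odd-position element is subtracted from its even-position
   neighbour, e.g. _mm_hsub_epi16 returns {a0-a1, a2-a3, ..., b6-b7}.  The
   *_hsubs_* forms subtract with signed saturation via vec_subs.  */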
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi32(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __C = (__v4si)(__v2du){__A, __B};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_subs(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __P);
  __v8hi __E = vec_perm(__C, __C, __Q);
  __C = vec_subs(__D, __E);
  return (__m64)((__v2du)__C)[1];
}

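/* Byte shuffle (PSHUFB): each result byte i is __A[__B[i] & 15], or zero
   when the high bit of control byte __B[i] is set.  vec_perm performs the
   table lookup and vec_sel zeroes the lanes selected by vec_cmplt.  */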
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
  __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
  return (__m128i)vec_sel(__C, __zero, __select);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
  __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
  __C = vec_sel(__C, __zero, __select);
  return (__m64)((__v2du)(__C))[0];
}

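/* Conditional sign (PSIGNB/PSIGNW/PSIGND): each element of __A is negated
   where the corresponding element of __B is negative, zeroed where it is
   zero, and passed through unchanged where it is positive.  A -1/0/+1
   multiplier is built from two compares and applied with vec_mul; the
   whole group is only compiled when _ARCH_PWR8 is defined.  */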
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
  __v16qi __selectpos =
      (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
  __v16qi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi16(__m128i __A, __m128i __B) {
  const __v8hi __zero = {0};
  __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
  __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
  __v8hi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi32(__m128i __A, __m128i __B) {
  const __v4si __zero = {0};
  __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
  __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
  __v4si __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi16(__m64 __A, __m64 __B) {
  const __v8hi __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi32(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

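/* Multiply-and-add of unsigned by signed bytes (PMADDUBSW): unsigned bytes
   of __A are multiplied by the corresponding signed bytes of __B, and each
   adjacent pair of 16-bit products is summed with signed saturation.  The
   bytes are widened to halfwords (__A's are masked with 0x00ff to recover
   their unsigned values), multiplied, and the even/odd product lanes are
   combined with vec_adds.  */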
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_epi16(__m128i __A, __m128i __B) {
  __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
  __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
  __v8hi __E = vec_unpackh((__v16qi)__B);
  __v8hi __F = vec_unpackl((__v16qi)__B);
  __C = vec_mul(__C, __E);
  __D = vec_mul(__D, __F);
  const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __E = vec_perm(__C, __D, __odds);
  __F = vec_perm(__C, __D, __evens);
  return (__m128i)vec_adds(__E, __F);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __C = vec_unpackl((__v16qi)__C);
  const __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __C = vec_and(__C, __unsigned);
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __D = vec_unpackl((__v16qi)__D);
  __D = vec_mul(__C, __D);
  const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __C = vec_perm(__D, __D, __odds);
  __D = vec_perm(__D, __D, __evens);
  __C = vec_adds(__C, __D);
  return (__m64)((__v2du)(__C))[0];
}

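/* Rounded scaled multiply (PMULHRSW): each halfword result is
   (((__A[i] * __B[i]) >> 14) + 1) >> 1, i.e. the high 16 bits of the
   product after rounding.  The operands are widened to words, multiplied,
   shifted and rounded, then packed back to halfwords.  */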
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
  __v4si __C = vec_unpackh((__v8hi)__A);
  __v4si __D = vec_unpackh((__v8hi)__B);
  __C = vec_mul(__C, __D);
  __D = vec_unpackl((__v8hi)__A);
  __v4si __E = vec_unpackl((__v8hi)__B);
  __D = vec_mul(__D, __E);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  __D = vec_sr(__D, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __D = vec_add(__D, __ones);
  __D = vec_sr(__D, (__v4su)__ones);
  return (__m128i)vec_pack(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __C = vec_unpackh((__v8hi)__C);
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __D = vec_unpackh((__v8hi)__D);
  __C = vec_mul(__C, __D);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __v8hi __E = vec_pack(__C, __D);
  return (__m64)((__v2du)(__E))[0];
}
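
/* Illustrative use (a sketch of typical ported x86 code; `a' and `b' are
   hypothetical vectors of unsigned and signed bytes respectively):

     __m128i prod = _mm_maddubs_epi16(a, b);      // 8 saturated pair sums
     __m128i sums = _mm_hadds_epi16(prod, prod);  // fold adjacent sums once

   Repeated _mm_hadds_epi16 folding reduces the eight halfword products
   toward a single horizontal sum, subject to signed saturation at each
   step.  */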
 
#else
#include_next <tmmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* TMMINTRIN_H_ */