Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
 
10
#ifndef __EMMINTRIN_H
11
#define __EMMINTRIN_H
12
 
13
#if !defined(__i386__) && !defined(__x86_64__)
14
#error "This header is only meant to be used on x86 and x64 architecture"
15
#endif
16
 
17
#include <xmmintrin.h>
18
 
19
/* 16-byte vector types declared with 16-byte alignment: __m128d has double
 * elements, __m128i has long long elements. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21
 
22
/* Same element types and vector size as above, but declared with
 * __aligned__(1) instead of __aligned__(16). */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23
typedef long long __m128i_u
24
    __attribute__((__vector_size__(16), __aligned__(1)));
25
 
26
/* Type defines.  */
27
/* Element-typed 16-byte vectors. Unlike __m128d / __m128i these carry no
 * explicit __aligned__ attribute. __v2df and __v2du are the cast targets used
 * by the arithmetic and bitwise functions in this header. */
typedef double __v2df __attribute__((__vector_size__(16)));
28
typedef long long __v2di __attribute__((__vector_size__(16)));
29
typedef short __v8hi __attribute__((__vector_size__(16)));
30
typedef char __v16qi __attribute__((__vector_size__(16)));
31
 
32
/* Unsigned counterparts of the integer element types above. */
33
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36
 
37
/* An explicitly signed char variant is needed because plain char (used by
 * __v16qi) does not state its signedness. This type should not appear in the
 * public interface. */
39
typedef signed char __v16qs __attribute__((__vector_size__(16)));
40
 
41
#ifdef __SSE2__
42
/* Both _Float16 and __bf16 require SSE2 to be enabled, so these 16-byte
 * half-precision vector types are only declared when __SSE2__ is defined. */
43
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45
/* Variant of __m128h declared with __aligned__(1). */
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46
 
47
/* 16-byte vectors of __bf16 elements, 16-byte aligned. */
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49
#endif
50
 
51
/* Define the default attributes for the functions in this file. */
52
/* __DEFAULT_FN_ATTRS: attribute list attached to the SSE2 functions in this
 * header (__always_inline__, __nodebug__, __target__("sse2"),
 * __min_vector_width__(128)). */
#define __DEFAULT_FN_ATTRS                                                     \
53
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
54
                 __min_vector_width__(128)))
/* __DEFAULT_FN_ATTRS_MMX: same idea, but targets "mmx,sse2" and declares a
 * minimum vector width of 64.
 * NOTE(review): presumably for functions that also touch 64-bit MMX values;
 * confirm against its users elsewhere in the header. */
#define __DEFAULT_FN_ATTRS_MMX                                                 \
56
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
57
                 __min_vector_width__(64)))
58
 
59
/// Adds lower double-precision values in both operands and returns the
60
///    sum in the lower 64 bits of the result. The upper 64 bits of the result
61
///    are copied from the upper double-precision value of the first operand.
62
///
63
/// \headerfile <x86intrin.h>
64
///
65
/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66
///
67
/// \param __a
68
///    A 128-bit vector of [2 x double] containing one of the source operands.
69
/// \param __b
70
///    A 128-bit vector of [2 x double] containing one of the source operands.
71
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72
///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
73
///    from the upper 64 bits of the first source operand.
74
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
                                                        __m128d __b) {
  /* Start from __a so that element 1 is carried over unchanged; only
   * element 0 receives the sum. */
  __m128d __res = __a;
  __res[0] = __a[0] + __b[0];
  return __res;
}
79
 
80
/// Adds two 128-bit vectors of [2 x double].
81
///
82
/// \headerfile <x86intrin.h>
83
///
84
/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85
///
86
/// \param __a
87
///    A 128-bit vector of [2 x double] containing one of the source operands.
88
/// \param __b
89
///    A 128-bit vector of [2 x double] containing one of the source operands.
90
/// \returns A 128-bit vector of [2 x double] containing the sums of both
91
///    operands.
92
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
                                                        __m128d __b) {
  /* Element-wise sum of both double lanes. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)(__lhs + __rhs);
}
96
 
97
/// Subtracts the lower double-precision value of the second operand
98
///    from the lower double-precision value of the first operand and returns
99
///    the difference in the lower 64 bits of the result. The upper 64 bits of
100
///    the result are copied from the upper double-precision value of the first
101
///    operand.
102
///
103
/// \headerfile <x86intrin.h>
104
///
105
/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106
///
107
/// \param __a
108
///    A 128-bit vector of [2 x double] containing the minuend.
109
/// \param __b
110
///    A 128-bit vector of [2 x double] containing the subtrahend.
111
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112
///    difference of the lower 64 bits of both operands. The upper 64 bits are
113
///    copied from the upper 64 bits of the first source operand.
114
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
                                                        __m128d __b) {
  /* Start from __a so that element 1 is carried over unchanged; only
   * element 0 receives the difference (__a minus __b). */
  __m128d __res = __a;
  __res[0] = __a[0] - __b[0];
  return __res;
}
119
 
120
/// Subtracts two 128-bit vectors of [2 x double].
121
///
122
/// \headerfile <x86intrin.h>
123
///
124
/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125
///
126
/// \param __a
127
///    A 128-bit vector of [2 x double] containing the minuend.
128
/// \param __b
129
///    A 128-bit vector of [2 x double] containing the subtrahend.
130
/// \returns A 128-bit vector of [2 x double] containing the differences between
131
///    both operands.
132
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
                                                        __m128d __b) {
  /* Element-wise difference: each lane of __b is subtracted from __a. */
  __v2df __minuend = (__v2df)__a;
  __v2df __subtrahend = (__v2df)__b;
  return (__m128d)(__minuend - __subtrahend);
}
136
 
137
/// Multiplies lower double-precision values in both operands and returns
138
///    the product in the lower 64 bits of the result. The upper 64 bits of the
139
///    result are copied from the upper double-precision value of the first
140
///    operand.
141
///
142
/// \headerfile <x86intrin.h>
143
///
144
/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145
///
146
/// \param __a
147
///    A 128-bit vector of [2 x double] containing one of the source operands.
148
/// \param __b
149
///    A 128-bit vector of [2 x double] containing one of the source operands.
150
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151
///    product of the lower 64 bits of both operands. The upper 64 bits are
152
///    copied from the upper 64 bits of the first source operand.
153
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
                                                        __m128d __b) {
  /* Start from __a so that element 1 is carried over unchanged; only
   * element 0 receives the product. */
  __m128d __res = __a;
  __res[0] = __a[0] * __b[0];
  return __res;
}
158
 
159
/// Multiplies two 128-bit vectors of [2 x double].
160
///
161
/// \headerfile <x86intrin.h>
162
///
163
/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164
///
165
/// \param __a
166
///    A 128-bit vector of [2 x double] containing one of the operands.
167
/// \param __b
168
///    A 128-bit vector of [2 x double] containing one of the operands.
169
/// \returns A 128-bit vector of [2 x double] containing the products of both
170
///    operands.
171
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
                                                        __m128d __b) {
  /* Element-wise product of both double lanes. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)(__lhs * __rhs);
}
175
 
176
/// Divides the lower double-precision value of the first operand by the
177
///    lower double-precision value of the second operand and returns the
178
///    quotient in the lower 64 bits of the result. The upper 64 bits of the
179
///    result are copied from the upper double-precision value of the first
180
///    operand.
181
///
182
/// \headerfile <x86intrin.h>
183
///
184
/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185
///
186
/// \param __a
187
///    A 128-bit vector of [2 x double] containing the dividend.
188
/// \param __b
189
///    A 128-bit vector of [2 x double] containing the divisor.
190
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191
///    quotient of the lower 64 bits of both operands. The upper 64 bits are
192
///    copied from the upper 64 bits of the first source operand.
193
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
                                                        __m128d __b) {
  /* Start from __a so that element 1 is carried over unchanged; only
   * element 0 receives the quotient (__a divided by __b). */
  __m128d __res = __a;
  __res[0] = __a[0] / __b[0];
  return __res;
}
198
 
199
/// Performs an element-by-element division of two 128-bit vectors of
200
///    [2 x double].
201
///
202
/// \headerfile <x86intrin.h>
203
///
204
/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205
///
206
/// \param __a
207
///    A 128-bit vector of [2 x double] containing the dividend.
208
/// \param __b
209
///    A 128-bit vector of [2 x double] containing the divisor.
210
/// \returns A 128-bit vector of [2 x double] containing the quotients of both
211
///    operands.
212
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
                                                        __m128d __b) {
  /* Element-wise quotient: each lane of __a is divided by the matching lane
   * of __b. */
  __v2df __dividend = (__v2df)__a;
  __v2df __divisor = (__v2df)__b;
  return (__m128d)(__dividend / __divisor);
}
216
 
217
/// Calculates the square root of the lower double-precision value of
218
///    the second operand and returns it in the lower 64 bits of the result.
219
///    The upper 64 bits of the result are copied from the upper
220
///    double-precision value of the first operand.
221
///
222
/// \headerfile <x86intrin.h>
223
///
224
/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225
///
226
/// \param __a
227
///    A 128-bit vector of [2 x double] containing one of the operands. The
228
///    upper 64 bits of this operand are copied to the upper 64 bits of the
229
///    result.
230
/// \param __b
231
///    A 128-bit vector of [2 x double] containing one of the operands. The
232
///    square root is calculated using the lower 64 bits of this operand.
233
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234
///    square root of the lower 64 bits of operand \a __b, and whose upper 64
235
///    bits are copied from the upper 64 bits of operand \a __a.
236
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
                                                         __m128d __b) {
  /* The builtin operates on __b; only element 0 of its result is used.
   * Element 1 of the return value is whatever __a already holds. */
  __m128d __root = __builtin_ia32_sqrtsd((__v2df)__b);
  __a[0] = __root[0];
  return __a;
}
241
 
242
/// Calculates the square root of each of the two values stored in a
243
///    128-bit vector of [2 x double].
244
///
245
/// \headerfile <x86intrin.h>
246
///
247
/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248
///
249
/// \param __a
250
///    A 128-bit vector of [2 x double].
251
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
252
///    values in the operand.
253
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
  /* Hand the operand to the packed square-root builtin as a __v2df. */
  __v2df __src = (__v2df)__a;
  return __builtin_ia32_sqrtpd(__src);
}
256
 
257
/// Compares lower 64-bit double-precision values of both operands, and
258
///    returns the lesser of the pair of values in the lower 64-bits of the
259
///    result. The upper 64 bits of the result are copied from the upper
260
///    double-precision value of the first operand.
261
///
262
/// \headerfile <x86intrin.h>
263
///
264
/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
265
///
266
/// \param __a
267
///    A 128-bit vector of [2 x double] containing one of the operands. The
268
///    lower 64 bits of this operand are used in the comparison.
269
/// \param __b
270
///    A 128-bit vector of [2 x double] containing one of the operands. The
271
///    lower 64 bits of this operand are used in the comparison.
272
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
273
///    minimum value between both operands. The upper 64 bits are copied from
274
///    the upper 64 bits of the first source operand.
275
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
                                                        __m128d __b) {
  /* Scalar minimum of the low elements, delegated to the minsd builtin. */
  __v2df __first = (__v2df)__a;
  __v2df __second = (__v2df)__b;
  return __builtin_ia32_minsd(__first, __second);
}
279
 
280
/// Performs element-by-element comparison of the two 128-bit vectors of
281
///    [2 x double] and returns the vector containing the lesser of each pair of
282
///    values.
283
///
284
/// \headerfile <x86intrin.h>
285
///
286
/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
287
///
288
/// \param __a
289
///    A 128-bit vector of [2 x double] containing one of the operands.
290
/// \param __b
291
///    A 128-bit vector of [2 x double] containing one of the operands.
292
/// \returns A 128-bit vector of [2 x double] containing the minimum values
293
///    between both operands.
294
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
                                                        __m128d __b) {
  /* Packed minimum of both lanes, delegated to the minpd builtin. */
  __v2df __first = (__v2df)__a;
  __v2df __second = (__v2df)__b;
  return __builtin_ia32_minpd(__first, __second);
}
298
 
299
/// Compares lower 64-bit double-precision values of both operands, and
300
///    returns the greater of the pair of values in the lower 64-bits of the
301
///    result. The upper 64 bits of the result are copied from the upper
302
///    double-precision value of the first operand.
303
///
304
/// \headerfile <x86intrin.h>
305
///
306
/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
307
///
308
/// \param __a
309
///    A 128-bit vector of [2 x double] containing one of the operands. The
310
///    lower 64 bits of this operand are used in the comparison.
311
/// \param __b
312
///    A 128-bit vector of [2 x double] containing one of the operands. The
313
///    lower 64 bits of this operand are used in the comparison.
314
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
315
///    maximum value between both operands. The upper 64 bits are copied from
316
///    the upper 64 bits of the first source operand.
317
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
                                                        __m128d __b) {
  /* Scalar maximum of the low elements, delegated to the maxsd builtin. */
  __v2df __first = (__v2df)__a;
  __v2df __second = (__v2df)__b;
  return __builtin_ia32_maxsd(__first, __second);
}
321
 
322
/// Performs element-by-element comparison of the two 128-bit vectors of
323
///    [2 x double] and returns the vector containing the greater of each pair
324
///    of values.
325
///
326
/// \headerfile <x86intrin.h>
327
///
328
/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
329
///
330
/// \param __a
331
///    A 128-bit vector of [2 x double] containing one of the operands.
332
/// \param __b
333
///    A 128-bit vector of [2 x double] containing one of the operands.
334
/// \returns A 128-bit vector of [2 x double] containing the maximum values
335
///    between both operands.
336
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
                                                        __m128d __b) {
  /* Packed maximum of both lanes, delegated to the maxpd builtin. */
  __v2df __first = (__v2df)__a;
  __v2df __second = (__v2df)__b;
  return __builtin_ia32_maxpd(__first, __second);
}
340
 
341
/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
342
///
343
/// \headerfile <x86intrin.h>
344
///
345
/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
346
///
347
/// \param __a
348
///    A 128-bit vector of [2 x double] containing one of the source operands.
349
/// \param __b
350
///    A 128-bit vector of [2 x double] containing one of the source operands.
351
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
352
///    values between both operands.
353
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
                                                        __m128d __b) {
  /* Bitwise operators need integer lanes, so both operands are viewed as
   * unsigned 64-bit vectors before the AND. */
  __v2du __bits_a = (__v2du)__a;
  __v2du __bits_b = (__v2du)__b;
  return (__m128d)(__bits_a & __bits_b);
}
357
 
358
/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
359
///    the one's complement of the values contained in the first source operand.
360
///
361
/// \headerfile <x86intrin.h>
362
///
363
/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
364
///
365
/// \param __a
366
///    A 128-bit vector of [2 x double] containing the left source operand. The
367
///    one's complement of this value is used in the bitwise AND.
368
/// \param __b
369
///    A 128-bit vector of [2 x double] containing the right source operand.
370
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
371
///    values in the second operand and the one's complement of the first
372
///    operand.
373
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
                                                           __m128d __b) {
  /* Complement the first operand's bits, then AND with the second. */
  __v2du __inverted_a = ~(__v2du)__a;
  __v2du __bits_b = (__v2du)__b;
  return (__m128d)(__inverted_a & __bits_b);
}
377
 
378
/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
379
///
380
/// \headerfile <x86intrin.h>
381
///
382
/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
383
///
384
/// \param __a
385
///    A 128-bit vector of [2 x double] containing one of the source operands.
386
/// \param __b
387
///    A 128-bit vector of [2 x double] containing one of the source operands.
388
/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
389
///    values between both operands.
390
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
                                                       __m128d __b) {
  /* Bitwise operators need integer lanes, so both operands are viewed as
   * unsigned 64-bit vectors before the OR. */
  __v2du __bits_a = (__v2du)__a;
  __v2du __bits_b = (__v2du)__b;
  return (__m128d)(__bits_a | __bits_b);
}
394
 
395
/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
396
///
397
/// \headerfile <x86intrin.h>
398
///
399
/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
400
///
401
/// \param __a
402
///    A 128-bit vector of [2 x double] containing one of the source operands.
403
/// \param __b
404
///    A 128-bit vector of [2 x double] containing one of the source operands.
405
/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
406
///    values between both operands.
407
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
                                                        __m128d __b) {
  /* Bitwise operators need integer lanes, so both operands are viewed as
   * unsigned 64-bit vectors before the XOR. */
  __v2du __bits_a = (__v2du)__a;
  __v2du __bits_b = (__v2du)__b;
  return (__m128d)(__bits_a ^ __bits_b);
}
411
 
412
/// Compares each of the corresponding double-precision values of the
413
///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
414
///    for false, 0xFFFFFFFFFFFFFFFF for true.
415
///
416
/// \headerfile <x86intrin.h>
417
///
418
/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
419
///
420
/// \param __a
421
///    A 128-bit vector of [2 x double].
422
/// \param __b
423
///    A 128-bit vector of [2 x double].
424
/// \returns A 128-bit vector containing the comparison results.
425
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
                                                          __m128d __b) {
  /* Per-lane equality test; operands are passed in (__a, __b) order. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmpeqpd(__lhs, __rhs);
}
429
 
430
/// Compares each of the corresponding double-precision values of the
431
///    128-bit vectors of [2 x double] to determine if the values in the first
432
///    operand are less than those in the second operand. Each comparison
433
///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
434
///
435
/// \headerfile <x86intrin.h>
436
///
437
/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
438
///
439
/// \param __a
440
///    A 128-bit vector of [2 x double].
441
/// \param __b
442
///    A 128-bit vector of [2 x double].
443
/// \returns A 128-bit vector containing the comparison results.
444
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
                                                          __m128d __b) {
  /* Per-lane "__a less than __b" test. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmpltpd(__lhs, __rhs);
}
448
 
449
/// Compares each of the corresponding double-precision values of the
450
///    128-bit vectors of [2 x double] to determine if the values in the first
451
///    operand are less than or equal to those in the second operand.
452
///
453
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
454
///
455
/// \headerfile <x86intrin.h>
456
///
457
/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
458
///
459
/// \param __a
460
///    A 128-bit vector of [2 x double].
461
/// \param __b
462
///    A 128-bit vector of [2 x double].
463
/// \returns A 128-bit vector containing the comparison results.
464
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
                                                          __m128d __b) {
  /* Per-lane "__a less than or equal to __b" test. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmplepd(__lhs, __rhs);
}
468
 
469
/// Compares each of the corresponding double-precision values of the
470
///    128-bit vectors of [2 x double] to determine if the values in the first
471
///    operand are greater than those in the second operand.
472
///
473
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
474
///
475
/// \headerfile <x86intrin.h>
476
///
477
/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
478
///
479
/// \param __a
480
///    A 128-bit vector of [2 x double].
481
/// \param __b
482
///    A 128-bit vector of [2 x double].
483
/// \returns A 128-bit vector containing the comparison results.
484
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
                                                          __m128d __b) {
  /* "__a greater than __b" is evaluated as "__b less than __a": the
   * less-than builtin is called with the operands swapped. */
  __v2df __smaller = (__v2df)__b;
  __v2df __larger = (__v2df)__a;
  return (__m128d)__builtin_ia32_cmpltpd(__smaller, __larger);
}
488
 
489
/// Compares each of the corresponding double-precision values of the
490
///    128-bit vectors of [2 x double] to determine if the values in the first
491
///    operand are greater than or equal to those in the second operand.
492
///
493
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
494
///
495
/// \headerfile <x86intrin.h>
496
///
497
/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
498
///
499
/// \param __a
500
///    A 128-bit vector of [2 x double].
501
/// \param __b
502
///    A 128-bit vector of [2 x double].
503
/// \returns A 128-bit vector containing the comparison results.
504
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
                                                          __m128d __b) {
  /* "__a greater than or equal to __b" is evaluated as "__b less than or
   * equal to __a": the less-or-equal builtin is called with the operands
   * swapped. */
  __v2df __smaller = (__v2df)__b;
  __v2df __larger = (__v2df)__a;
  return (__m128d)__builtin_ia32_cmplepd(__smaller, __larger);
}
508
 
509
/// Compares each of the corresponding double-precision values of the
510
///    128-bit vectors of [2 x double] to determine if the values in the first
511
///    operand are ordered with respect to those in the second operand.
512
///
513
///    A pair of double-precision values are "ordered" with respect to each
514
///    other if neither value is a NaN. Each comparison yields 0x0 for false,
515
///    0xFFFFFFFFFFFFFFFF for true.
516
///
517
/// \headerfile <x86intrin.h>
518
///
519
/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
520
///
521
/// \param __a
522
///    A 128-bit vector of [2 x double].
523
/// \param __b
524
///    A 128-bit vector of [2 x double].
525
/// \returns A 128-bit vector containing the comparison results.
526
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
                                                           __m128d __b) {
  /* Per-lane "ordered" test, delegated to the cmpordpd builtin. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmpordpd(__lhs, __rhs);
}
530
 
531
/// Compares each of the corresponding double-precision values of the
532
///    128-bit vectors of [2 x double] to determine if the values in the first
533
///    operand are unordered with respect to those in the second operand.
534
///
535
///    A pair of double-precision values are "unordered" with respect to each
536
///    other if one or both values are NaN. Each comparison yields 0x0 for
537
///    false, 0xFFFFFFFFFFFFFFFF for true.
538
///
539
/// \headerfile <x86intrin.h>
540
///
541
/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
542
///   instruction.
543
///
544
/// \param __a
545
///    A 128-bit vector of [2 x double].
546
/// \param __b
547
///    A 128-bit vector of [2 x double].
548
/// \returns A 128-bit vector containing the comparison results.
549
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
                                                             __m128d __b) {
  /* Per-lane "unordered" test, delegated to the cmpunordpd builtin. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmpunordpd(__lhs, __rhs);
}
553
 
554
/// Compares each of the corresponding double-precision values of the
555
///    128-bit vectors of [2 x double] to determine if the values in the first
556
///    operand are unequal to those in the second operand.
557
///
558
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
559
///
560
/// \headerfile <x86intrin.h>
561
///
562
/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
563
///
564
/// \param __a
565
///    A 128-bit vector of [2 x double].
566
/// \param __b
567
///    A 128-bit vector of [2 x double].
568
/// \returns A 128-bit vector containing the comparison results.
569
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
                                                           __m128d __b) {
  /* Per-lane inequality test, delegated to the cmpneqpd builtin. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmpneqpd(__lhs, __rhs);
}
573
 
574
/// Compares each of the corresponding double-precision values of the
575
///    128-bit vectors of [2 x double] to determine if the values in the first
576
///    operand are not less than those in the second operand.
577
///
578
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
579
///
580
/// \headerfile <x86intrin.h>
581
///
582
/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
583
///
584
/// \param __a
585
///    A 128-bit vector of [2 x double].
586
/// \param __b
587
///    A 128-bit vector of [2 x double].
588
/// \returns A 128-bit vector containing the comparison results.
589
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
                                                           __m128d __b) {
  /* Per-lane "__a not less than __b" test. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmpnltpd(__lhs, __rhs);
}
593
 
594
/// Compares each of the corresponding double-precision values of the
595
///    128-bit vectors of [2 x double] to determine if the values in the first
596
///    operand are not less than or equal to those in the second operand.
597
///
598
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
599
///
600
/// \headerfile <x86intrin.h>
601
///
602
/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
603
///
604
/// \param __a
605
///    A 128-bit vector of [2 x double].
606
/// \param __b
607
///    A 128-bit vector of [2 x double].
608
/// \returns A 128-bit vector containing the comparison results.
609
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
                                                           __m128d __b) {
  /* Per-lane "__a not less than or equal to __b" test. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmpnlepd(__lhs, __rhs);
}
613
 
614
/// Compares each of the corresponding double-precision values of the
615
///    128-bit vectors of [2 x double] to determine if the values in the first
616
///    operand are not greater than those in the second operand.
617
///
618
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
619
///
620
/// \headerfile <x86intrin.h>
621
///
622
/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
623
///
624
/// \param __a
625
///    A 128-bit vector of [2 x double].
626
/// \param __b
627
///    A 128-bit vector of [2 x double].
628
/// \returns A 128-bit vector containing the comparison results.
629
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
                                                           __m128d __b) {
  /* "__a not greater than __b" is evaluated as "__b not less than __a": the
   * not-less-than builtin is called with the operands swapped. */
  __v2df __first = (__v2df)__b;
  __v2df __second = (__v2df)__a;
  return (__m128d)__builtin_ia32_cmpnltpd(__first, __second);
}
633
 
634
/// Compares each of the corresponding double-precision values of the
635
///    128-bit vectors of [2 x double] to determine if the values in the first
636
///    operand are not greater than or equal to those in the second operand.
637
///
638
///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
639
///
640
/// \headerfile <x86intrin.h>
641
///
642
/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
643
///
644
/// \param __a
645
///    A 128-bit vector of [2 x double].
646
/// \param __b
647
///    A 128-bit vector of [2 x double].
648
/// \returns A 128-bit vector containing the comparison results.
649
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
                                                           __m128d __b) {
  /* "__a not greater than or equal to __b" is evaluated as "__b not less
   * than or equal to __a": the not-less-or-equal builtin is called with the
   * operands swapped. */
  __v2df __first = (__v2df)__b;
  __v2df __second = (__v2df)__a;
  return (__m128d)__builtin_ia32_cmpnlepd(__first, __second);
}
653
 
654
/// Compares the lower double-precision floating-point values in each of
655
///    the two 128-bit floating-point vectors of [2 x double] for equality.
656
///
657
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
658
///
659
/// \headerfile <x86intrin.h>
660
///
661
/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
662
///
663
/// \param __a
664
///    A 128-bit vector of [2 x double]. The lower double-precision value is
665
///    compared to the lower double-precision value of \a __b.
666
/// \param __b
667
///    A 128-bit vector of [2 x double]. The lower double-precision value is
668
///    compared to the lower double-precision value of \a __a.
669
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
670
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
671
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
                                                          __m128d __b) {
  /* Scalar equality test on the low elements, delegated to the cmpeqsd
   * builtin with operands in (__a, __b) order. */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return (__m128d)__builtin_ia32_cmpeqsd(__lhs, __rhs);
}
675
 
676
/// Compares the lower double-precision floating-point values in each of
677
///    the two 128-bit floating-point vectors of [2 x double] to determine if
678
///    the value in the first parameter is less than the corresponding value in
679
///    the second parameter.
680
///
681
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
682
///
683
/// \headerfile <x86intrin.h>
684
///
685
/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
686
///
687
/// \param __a
688
///    A 128-bit vector of [2 x double]. The lower double-precision value is
689
///    compared to the lower double-precision value of \a __b.
690
/// \param __b
691
///    A 128-bit vector of [2 x double]. The lower double-precision value is
692
///    compared to the lower double-precision value of \a __a.
693
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
694
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
695
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
696
                                                          __m128d __b) {
697
  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
698
}
699
 
700
/// Compares the lower double-precision floating-point values in each of
701
///    the two 128-bit floating-point vectors of [2 x double] to determine if
702
///    the value in the first parameter is less than or equal to the
703
///    corresponding value in the second parameter.
704
///
705
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
706
///
707
/// \headerfile <x86intrin.h>
708
///
709
/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
710
///
711
/// \param __a
712
///    A 128-bit vector of [2 x double]. The lower double-precision value is
713
///    compared to the lower double-precision value of \a __b.
714
/// \param __b
715
///    A 128-bit vector of [2 x double]. The lower double-precision value is
716
///    compared to the lower double-precision value of \a __a.
717
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
718
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
719
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
720
                                                          __m128d __b) {
721
  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
722
}
723
 
724
/// Compares the lower double-precision floating-point values in each of
725
///    the two 128-bit floating-point vectors of [2 x double] to determine if
726
///    the value in the first parameter is greater than the corresponding value
727
///    in the second parameter.
728
///
729
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
730
///
731
/// \headerfile <x86intrin.h>
732
///
733
/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
734
///
735
/// \param __a
736
///     A 128-bit vector of [2 x double]. The lower double-precision value is
737
///     compared to the lower double-precision value of \a __b.
738
/// \param __b
739
///     A 128-bit vector of [2 x double]. The lower double-precision value is
740
///     compared to the lower double-precision value of \a __a.
741
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
742
///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
743
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
744
                                                          __m128d __b) {
745
  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
746
  return __extension__(__m128d){__c[0], __a[1]};
747
}
748
 
749
/// Compares the lower double-precision floating-point values in each of
750
///    the two 128-bit floating-point vectors of [2 x double] to determine if
751
///    the value in the first parameter is greater than or equal to the
752
///    corresponding value in the second parameter.
753
///
754
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
755
///
756
/// \headerfile <x86intrin.h>
757
///
758
/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
759
///
760
/// \param __a
761
///    A 128-bit vector of [2 x double]. The lower double-precision value is
762
///    compared to the lower double-precision value of \a __b.
763
/// \param __b
764
///    A 128-bit vector of [2 x double]. The lower double-precision value is
765
///    compared to the lower double-precision value of \a __a.
766
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
767
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
768
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
769
                                                          __m128d __b) {
770
  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
771
  return __extension__(__m128d){__c[0], __a[1]};
772
}
773
 
774
/// Compares the lower double-precision floating-point values in each of
775
///    the two 128-bit floating-point vectors of [2 x double] to determine if
776
///    the value in the first parameter is "ordered" with respect to the
777
///    corresponding value in the second parameter.
778
///
779
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
780
///    of double-precision values are "ordered" with respect to each other if
781
///    neither value is a NaN.
782
///
783
/// \headerfile <x86intrin.h>
784
///
785
/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
786
///
787
/// \param __a
788
///    A 128-bit vector of [2 x double]. The lower double-precision value is
789
///    compared to the lower double-precision value of \a __b.
790
/// \param __b
791
///    A 128-bit vector of [2 x double]. The lower double-precision value is
792
///    compared to the lower double-precision value of \a __a.
793
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
794
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
795
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
796
                                                           __m128d __b) {
797
  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
798
}
799
 
800
/// Compares the lower double-precision floating-point values in each of
801
///    the two 128-bit floating-point vectors of [2 x double] to determine if
802
///    the value in the first parameter is "unordered" with respect to the
803
///    corresponding value in the second parameter.
804
///
805
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
806
///    of double-precision values are "unordered" with respect to each other if
807
///    one or both values are NaN.
808
///
809
/// \headerfile <x86intrin.h>
810
///
811
/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
812
///   instruction.
813
///
814
/// \param __a
815
///    A 128-bit vector of [2 x double]. The lower double-precision value is
816
///    compared to the lower double-precision value of \a __b.
817
/// \param __b
818
///    A 128-bit vector of [2 x double]. The lower double-precision value is
819
///    compared to the lower double-precision value of \a __a.
820
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
821
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
822
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
823
                                                             __m128d __b) {
824
  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
825
}
826
 
827
/// Compares the lower double-precision floating-point values in each of
828
///    the two 128-bit floating-point vectors of [2 x double] to determine if
829
///    the value in the first parameter is unequal to the corresponding value in
830
///    the second parameter.
831
///
832
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
833
///
834
/// \headerfile <x86intrin.h>
835
///
836
/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
837
///
838
/// \param __a
839
///    A 128-bit vector of [2 x double]. The lower double-precision value is
840
///    compared to the lower double-precision value of \a __b.
841
/// \param __b
842
///    A 128-bit vector of [2 x double]. The lower double-precision value is
843
///    compared to the lower double-precision value of \a __a.
844
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
845
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
846
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
847
                                                           __m128d __b) {
848
  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
849
}
850
 
851
/// Compares the lower double-precision floating-point values in each of
852
///    the two 128-bit floating-point vectors of [2 x double] to determine if
853
///    the value in the first parameter is not less than the corresponding
854
///    value in the second parameter.
855
///
856
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
857
///
858
/// \headerfile <x86intrin.h>
859
///
860
/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
861
///
862
/// \param __a
863
///    A 128-bit vector of [2 x double]. The lower double-precision value is
864
///    compared to the lower double-precision value of \a __b.
865
/// \param __b
866
///    A 128-bit vector of [2 x double]. The lower double-precision value is
867
///    compared to the lower double-precision value of \a __a.
868
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
869
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
870
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
871
                                                           __m128d __b) {
872
  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
873
}
874
 
875
/// Compares the lower double-precision floating-point values in each of
876
///    the two 128-bit floating-point vectors of [2 x double] to determine if
877
///    the value in the first parameter is not less than or equal to the
878
///    corresponding value in the second parameter.
879
///
880
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
881
///
882
/// \headerfile <x86intrin.h>
883
///
884
/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
885
///
886
/// \param __a
887
///    A 128-bit vector of [2 x double]. The lower double-precision value is
888
///    compared to the lower double-precision value of \a __b.
889
/// \param __b
890
///    A 128-bit vector of [2 x double]. The lower double-precision value is
891
///    compared to the lower double-precision value of \a __a.
892
/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
893
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
894
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
895
                                                           __m128d __b) {
896
  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
897
}
898
 
899
/// Compares the lower double-precision floating-point values in each of
900
///    the two 128-bit floating-point vectors of [2 x double] to determine if
901
///    the value in the first parameter is not greater than the corresponding
902
///    value in the second parameter.
903
///
904
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
905
///
906
/// \headerfile <x86intrin.h>
907
///
908
/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
909
///
910
/// \param __a
911
///    A 128-bit vector of [2 x double]. The lower double-precision value is
912
///    compared to the lower double-precision value of \a __b.
913
/// \param __b
914
///    A 128-bit vector of [2 x double]. The lower double-precision value is
915
///    compared to the lower double-precision value of \a __a.
916
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
917
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
918
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
919
                                                           __m128d __b) {
920
  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
921
  return __extension__(__m128d){__c[0], __a[1]};
922
}
923
 
924
/// Compares the lower double-precision floating-point values in each of
925
///    the two 128-bit floating-point vectors of [2 x double] to determine if
926
///    the value in the first parameter is not greater than or equal to the
927
///    corresponding value in the second parameter.
928
///
929
///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
930
///
931
/// \headerfile <x86intrin.h>
932
///
933
/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
934
///
935
/// \param __a
936
///    A 128-bit vector of [2 x double]. The lower double-precision value is
937
///    compared to the lower double-precision value of \a __b.
938
/// \param __b
939
///    A 128-bit vector of [2 x double]. The lower double-precision value is
940
///    compared to the lower double-precision value of \a __a.
941
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
942
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
943
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
944
                                                           __m128d __b) {
945
  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
946
  return __extension__(__m128d){__c[0], __a[1]};
947
}
948
 
949
/// Compares the lower double-precision floating-point values in each of
950
///    the two 128-bit floating-point vectors of [2 x double] for equality.
951
///
952
///    The comparison yields 0 for false, 1 for true. If either of the two
953
///    lower double-precision values is NaN, 0 is returned.
954
///
955
/// \headerfile <x86intrin.h>
956
///
957
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
958
///
959
/// \param __a
960
///    A 128-bit vector of [2 x double]. The lower double-precision value is
961
///    compared to the lower double-precision value of \a __b.
962
/// \param __b
963
///    A 128-bit vector of [2 x double]. The lower double-precision value is
964
///    compared to the lower double-precision value of \a __a.
965
/// \returns An integer containing the comparison results. If either of the two
966
///    lower double-precision values is NaN, 0 is returned.
967
static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
968
                                                       __m128d __b) {
969
  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
970
}
971
 
972
/// Compares the lower double-precision floating-point values in each of
973
///    the two 128-bit floating-point vectors of [2 x double] to determine if
974
///    the value in the first parameter is less than the corresponding value in
975
///    the second parameter.
976
///
977
///    The comparison yields 0 for false, 1 for true. If either of the two
978
///    lower double-precision values is NaN, 0 is returned.
979
///
980
/// \headerfile <x86intrin.h>
981
///
982
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
983
///
984
/// \param __a
985
///    A 128-bit vector of [2 x double]. The lower double-precision value is
986
///    compared to the lower double-precision value of \a __b.
987
/// \param __b
988
///    A 128-bit vector of [2 x double]. The lower double-precision value is
989
///    compared to the lower double-precision value of \a __a.
990
/// \returns An integer containing the comparison results. If either of the two
991
///     lower double-precision values is NaN, 0 is returned.
992
static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
993
                                                       __m128d __b) {
994
  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
995
}
996
 
997
/// Compares the lower double-precision floating-point values in each of
998
///    the two 128-bit floating-point vectors of [2 x double] to determine if
999
///    the value in the first parameter is less than or equal to the
1000
///    corresponding value in the second parameter.
1001
///
1002
///    The comparison yields 0 for false, 1 for true. If either of the two
1003
///    lower double-precision values is NaN, 0 is returned.
1004
///
1005
/// \headerfile <x86intrin.h>
1006
///
1007
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1008
///
1009
/// \param __a
1010
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1011
///    compared to the lower double-precision value of \a __b.
1012
/// \param __b
1013
///     A 128-bit vector of [2 x double]. The lower double-precision value is
1014
///     compared to the lower double-precision value of \a __a.
1015
/// \returns An integer containing the comparison results. If either of the two
1016
///     lower double-precision values is NaN, 0 is returned.
1017
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1018
                                                       __m128d __b) {
1019
  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1020
}
1021
 
1022
/// Compares the lower double-precision floating-point values in each of
1023
///    the two 128-bit floating-point vectors of [2 x double] to determine if
1024
///    the value in the first parameter is greater than the corresponding value
1025
///    in the second parameter.
1026
///
1027
///    The comparison yields 0 for false, 1 for true. If either of the two
1028
///    lower double-precision values is NaN, 0 is returned.
1029
///
1030
/// \headerfile <x86intrin.h>
1031
///
1032
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1033
///
1034
/// \param __a
1035
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1036
///    compared to the lower double-precision value of \a __b.
1037
/// \param __b
1038
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1039
///    compared to the lower double-precision value of \a __a.
1040
/// \returns An integer containing the comparison results. If either of the two
1041
///     lower double-precision values is NaN, 0 is returned.
1042
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1043
                                                       __m128d __b) {
1044
  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1045
}
1046
 
1047
/// Compares the lower double-precision floating-point values in each of
1048
///    the two 128-bit floating-point vectors of [2 x double] to determine if
1049
///    the value in the first parameter is greater than or equal to the
1050
///    corresponding value in the second parameter.
1051
///
1052
///    The comparison yields 0 for false, 1 for true. If either of the two
1053
///    lower double-precision values is NaN, 0 is returned.
1054
///
1055
/// \headerfile <x86intrin.h>
1056
///
1057
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1058
///
1059
/// \param __a
1060
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1061
///    compared to the lower double-precision value of \a __b.
1062
/// \param __b
1063
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1064
///    compared to the lower double-precision value of \a __a.
1065
/// \returns An integer containing the comparison results. If either of the two
1066
///    lower double-precision values is NaN, 0 is returned.
1067
static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1068
                                                       __m128d __b) {
1069
  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1070
}
1071
 
1072
/// Compares the lower double-precision floating-point values in each of
1073
///    the two 128-bit floating-point vectors of [2 x double] to determine if
1074
///    the value in the first parameter is unequal to the corresponding value in
1075
///    the second parameter.
1076
///
1077
///    The comparison yields 0 for false, 1 for true. If either of the two
1078
///    lower double-precision values is NaN, 1 is returned.
1079
///
1080
/// \headerfile <x86intrin.h>
1081
///
1082
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1083
///
1084
/// \param __a
1085
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1086
///    compared to the lower double-precision value of \a __b.
1087
/// \param __b
1088
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1089
///    compared to the lower double-precision value of \a __a.
1090
/// \returns An integer containing the comparison results. If either of the two
1091
///     lower double-precision values is NaN, 1 is returned.
1092
static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1093
                                                        __m128d __b) {
1094
  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1095
}
1096
 
1097
/// Compares the lower double-precision floating-point values in each of
1098
///    the two 128-bit floating-point vectors of [2 x double] for equality. The
1099
///    comparison yields 0 for false, 1 for true.
1100
///
1101
///    If either of the two lower double-precision values is NaN, 0 is returned.
1102
///
1103
/// \headerfile <x86intrin.h>
1104
///
1105
/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1106
///
1107
/// \param __a
1108
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1109
///    compared to the lower double-precision value of \a __b.
1110
/// \param __b
1111
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1112
///    compared to the lower double-precision value of \a __a.
1113
/// \returns An integer containing the comparison results. If either of the two
1114
///    lower double-precision values is NaN, 0 is returned.
1115
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1116
                                                        __m128d __b) {
1117
  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1118
}
1119
 
1120
/// Compares the lower double-precision floating-point values in each of
1121
///    the two 128-bit floating-point vectors of [2 x double] to determine if
1122
///    the value in the first parameter is less than the corresponding value in
1123
///    the second parameter.
1124
///
1125
///    The comparison yields 0 for false, 1 for true. If either of the two lower
1126
///    double-precision values is NaN, 0 is returned.
1127
///
1128
/// \headerfile <x86intrin.h>
1129
///
1130
/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1131
///
1132
/// \param __a
1133
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1134
///    compared to the lower double-precision value of \a __b.
1135
/// \param __b
1136
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1137
///    compared to the lower double-precision value of \a __a.
1138
/// \returns An integer containing the comparison results. If either of the two
1139
///    lower double-precision values is NaN, 0 is returned.
1140
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1141
                                                        __m128d __b) {
1142
  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1143
}
1144
 
1145
/// Compares the lower double-precision floating-point values in each of
1146
///    the two 128-bit floating-point vectors of [2 x double] to determine if
1147
///    the value in the first parameter is less than or equal to the
1148
///    corresponding value in the second parameter.
1149
///
1150
///    The comparison yields 0 for false, 1 for true. If either of the two lower
1151
///    double-precision values is NaN, 0 is returned.
1152
///
1153
/// \headerfile <x86intrin.h>
1154
///
1155
/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1156
///
1157
/// \param __a
1158
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1159
///    compared to the lower double-precision value of \a __b.
1160
/// \param __b
1161
///     A 128-bit vector of [2 x double]. The lower double-precision value is
1162
///     compared to the lower double-precision value of \a __a.
1163
/// \returns An integer containing the comparison results. If either of the two
1164
///     lower double-precision values is NaN, 0 is returned.
1165
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1166
                                                        __m128d __b) {
1167
  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1168
}
1169
 
1170
/// Compares the lower double-precision floating-point values in each of
1171
///    the two 128-bit floating-point vectors of [2 x double] to determine if
1172
///    the value in the first parameter is greater than the corresponding value
1173
///    in the second parameter.
1174
///
1175
///    The comparison yields 0 for false, 1 for true. If either of the two lower
1176
///    double-precision values is NaN, 0 is returned.
1177
///
1178
/// \headerfile <x86intrin.h>
1179
///
1180
/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1181
///
1182
/// \param __a
1183
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1184
///    compared to the lower double-precision value of \a __b.
1185
/// \param __b
1186
///     A 128-bit vector of [2 x double]. The lower double-precision value is
1187
///     compared to the lower double-precision value of \a __a.
1188
/// \returns An integer containing the comparison results. If either of the two
1189
///     lower double-precision values is NaN, 0 is returned.
1190
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1191
                                                        __m128d __b) {
1192
  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1193
}
1194
 
1195
/// Compares the lower double-precision floating-point values in each of
1196
///    the two 128-bit floating-point vectors of [2 x double] to determine if
1197
///    the value in the first parameter is greater than or equal to the
1198
///    corresponding value in the second parameter.
1199
///
1200
///    The comparison yields 0 for false, 1 for true.  If either of the two
1201
///    lower double-precision values is NaN, 0 is returned.
1202
///
1203
/// \headerfile <x86intrin.h>
1204
///
1205
/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1206
///
1207
/// \param __a
1208
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1209
///    compared to the lower double-precision value of \a __b.
1210
/// \param __b
1211
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1212
///    compared to the lower double-precision value of \a __a.
1213
/// \returns An integer containing the comparison results. If either of the two
1214
///    lower double-precision values is NaN, 0 is returned.
1215
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1216
                                                        __m128d __b) {
1217
  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1218
}
1219
 
1220
/// Compares the lower double-precision floating-point values in each of
1221
///    the two 128-bit floating-point vectors of [2 x double] to determine if
1222
///    the value in the first parameter is unequal to the corresponding value in
1223
///    the second parameter.
1224
///
1225
///    The comparison yields 0 for false, 1 for true. If either of the two lower
1226
///    double-precision values is NaN, 1 is returned.
1227
///
1228
/// \headerfile <x86intrin.h>
1229
///
1230
/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1231
///
1232
/// \param __a
1233
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1234
///    compared to the lower double-precision value of \a __b.
1235
/// \param __b
1236
///    A 128-bit vector of [2 x double]. The lower double-precision value is
1237
///    compared to the lower double-precision value of \a __a.
1238
/// \returns An integer containing the comparison result. If either of the two
1239
///    lower double-precision values is NaN, 1 is returned.
1240
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1241
                                                         __m128d __b) {
1242
  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1243
}
1244
 
1245
/// Converts the two double-precision floating-point elements of a
1246
///    128-bit vector of [2 x double] into two single-precision floating-point
1247
///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1248
///    The upper 64 bits of the result vector are set to zero.
1249
///
1250
/// \headerfile <x86intrin.h>
1251
///
1252
/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1253
///
1254
/// \param __a
1255
///    A 128-bit vector of [2 x double].
1256
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1257
///    converted values. The upper 64 bits are set to zero.
1258
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1259
  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1260
}
1261
 
1262
/// Converts the lower two single-precision floating-point elements of a
1263
///    128-bit vector of [4 x float] into two double-precision floating-point
1264
///    values, returned in a 128-bit vector of [2 x double]. The upper two
1265
///    elements of the input vector are unused.
1266
///
1267
/// \headerfile <x86intrin.h>
1268
///
1269
/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1270
///
1271
/// \param __a
1272
///    A 128-bit vector of [4 x float]. The lower two single-precision
1273
///    floating-point elements are converted to double-precision values. The
1274
///    upper two elements are unused.
1275
/// \returns A 128-bit vector of [2 x double] containing the converted values.
1276
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1277
  return (__m128d) __builtin_convertvector(
1278
      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1279
}
1280
 
1281
/// Converts the lower two integer elements of a 128-bit vector of
1282
///    [4 x i32] into two double-precision floating-point values, returned in a
1283
///    128-bit vector of [2 x double].
1284
///
1285
///    The upper two elements of the input vector are unused.
1286
///
1287
/// \headerfile <x86intrin.h>
1288
///
1289
/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1290
///
1291
/// \param __a
1292
///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1293
///    converted to double-precision values.
1294
///
1295
///    The upper two elements are unused.
1296
/// \returns A 128-bit vector of [2 x double] containing the converted values.
1297
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1298
  return (__m128d) __builtin_convertvector(
1299
      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1300
}
1301
 
1302
/// Converts the two double-precision floating-point elements of a
1303
///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1304
///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1305
///    64 bits of the result vector are set to zero.
1306
///
1307
/// \headerfile <x86intrin.h>
1308
///
1309
/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1310
///
1311
/// \param __a
1312
///    A 128-bit vector of [2 x double].
1313
/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1314
///    converted values. The upper 64 bits are set to zero.
1315
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
  __v2df __src = (__v2df)__a;
  return __builtin_ia32_cvtpd2dq(__src);
}
1318
 
1319
/// Converts the low-order element of a 128-bit vector of [2 x double]
1320
///    into a 32-bit signed integer value.
1321
///
1322
/// \headerfile <x86intrin.h>
1323
///
1324
/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1325
///
1326
/// \param __a
1327
///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1328
///    conversion.
1329
/// \returns A 32-bit signed integer containing the converted value.
1330
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
  __v2df __src = (__v2df)__a;
  int __res = __builtin_ia32_cvtsd2si(__src);
  return __res;
}
1333
 
1334
/// Converts the lower double-precision floating-point element of a
1335
///    128-bit vector of [2 x double], in the second parameter, into a
1336
///    single-precision floating-point value, returned in the lower 32 bits of a
1337
///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1338
///    copied from the upper 96 bits of the first parameter.
1339
///
1340
/// \headerfile <x86intrin.h>
1341
///
1342
/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1343
///
1344
/// \param __a
1345
///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1346
///    copied to the upper 96 bits of the result.
1347
/// \param __b
1348
///    A 128-bit vector of [2 x double]. The lower double-precision
1349
///    floating-point element is used in the conversion.
1350
/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1351
///    converted value from the second parameter. The upper 96 bits are copied
1352
///    from the upper 96 bits of the first parameter.
1353
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
                                                         __m128d __b) {
  /* __a is passed as the first builtin operand, __b as the second. */
  __v4sf __first = (__v4sf)__a;
  __v2df __second = (__v2df)__b;
  return (__m128)__builtin_ia32_cvtsd2ss(__first, __second);
}
1357
 
1358
/// Converts a 32-bit signed integer value, in the second parameter, into
1359
///    a double-precision floating-point value, returned in the lower 64 bits of
1360
///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1361
///    are copied from the upper 64 bits of the first parameter.
1362
///
1363
/// \headerfile <x86intrin.h>
1364
///
1365
/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1366
///
1367
/// \param __a
1368
///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1369
///    copied to the upper 64 bits of the result.
1370
/// \param __b
1371
///    A 32-bit signed integer containing the value to be converted.
1372
/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1373
///    converted value from the second parameter. The upper 64 bits are copied
1374
///    from the upper 64 bits of the first parameter.
1375
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
                                                            int __b) {
  /* Start from __a so element 1 is preserved; overwrite element 0 only. */
  __m128d __res = __a;
  __res[0] = (double)__b;
  return __res;
}
1380
 
1381
/// Converts the lower single-precision floating-point element of a
1382
///    128-bit vector of [4 x float], in the second parameter, into a
1383
///    double-precision floating-point value, returned in the lower 64 bits of
1384
///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1385
///    are copied from the upper 64 bits of the first parameter.
1386
///
1387
/// \headerfile <x86intrin.h>
1388
///
1389
/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1390
///
1391
/// \param __a
1392
///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1393
///    copied to the upper 64 bits of the result.
1394
/// \param __b
1395
///    A 128-bit vector of [4 x float]. The lower single-precision
1396
///    floating-point element is used in the conversion.
1397
/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1398
///    converted value from the second parameter. The upper 64 bits are copied
1399
///    from the upper 64 bits of the first parameter.
1400
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
                                                          __m128 __b) {
  /* Start from __a so element 1 is preserved; element 0 receives the
   * widened low float of __b. */
  __m128d __res = __a;
  __res[0] = (double)__b[0];
  return __res;
}
1405
 
1406
/// Converts the two double-precision floating-point elements of a
1407
///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1408
///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1409
///
1410
///    If the result of either conversion is inexact, the result is truncated
1411
///    (rounded towards zero) regardless of the current MXCSR setting. The upper
1412
///    64 bits of the result vector are set to zero.
1413
///
1414
/// \headerfile <x86intrin.h>
1415
///
1416
/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1417
///   instruction.
1418
///
1419
/// \param __a
1420
///    A 128-bit vector of [2 x double].
1421
/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1422
///    converted values. The upper 64 bits are set to zero.
1423
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
  __v2df __src = (__v2df)__a;
  return (__m128i)__builtin_ia32_cvttpd2dq(__src);
}
1426
 
1427
/// Converts the low-order element of a [2 x double] vector into a 32-bit
1428
///    signed integer value, truncating the result when it is inexact.
1429
///
1430
/// \headerfile <x86intrin.h>
1431
///
1432
/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1433
///   instruction.
1434
///
1435
/// \param __a
1436
///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1437
///    conversion.
1438
/// \returns A 32-bit signed integer containing the converted value.
1439
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
  __v2df __src = (__v2df)__a;
  int __res = __builtin_ia32_cvttsd2si(__src);
  return __res;
}
1442
 
1443
/// Converts the two double-precision floating-point elements of a
1444
///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1445
///    returned in a 64-bit vector of [2 x i32].
1446
///
1447
/// \headerfile <x86intrin.h>
1448
///
1449
/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1450
///
1451
/// \param __a
1452
///    A 128-bit vector of [2 x double].
1453
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1454
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
  __v2df __src = (__v2df)__a;
  return (__m64)__builtin_ia32_cvtpd2pi(__src);
}
1457
 
1458
/// Converts the two double-precision floating-point elements of a
1459
///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1460
///    returned in a 64-bit vector of [2 x i32].
1461
///
1462
///    If the result of either conversion is inexact, the result is truncated
1463
///    (rounded towards zero) regardless of the current MXCSR setting.
1464
///
1465
/// \headerfile <x86intrin.h>
1466
///
1467
/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1468
///
1469
/// \param __a
1470
///    A 128-bit vector of [2 x double].
1471
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1472
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
  __v2df __src = (__v2df)__a;
  return (__m64)__builtin_ia32_cvttpd2pi(__src);
}
1475
 
1476
/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1477
///    [2 x i32] into two double-precision floating-point values, returned in a
1478
///    128-bit vector of [2 x double].
1479
///
1480
/// \headerfile <x86intrin.h>
1481
///
1482
/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1483
///
1484
/// \param __a
1485
///    A 64-bit vector of [2 x i32].
1486
/// \returns A 128-bit vector of [2 x double] containing the converted values.
1487
static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
  __v2si __src = (__v2si)__a;
  return __builtin_ia32_cvtpi2pd(__src);
}
1490
 
1491
/// Returns the low-order element of a 128-bit vector of [2 x double] as
1492
///    a double-precision floating-point value.
1493
///
1494
/// \headerfile <x86intrin.h>
1495
///
1496
/// This intrinsic has no corresponding instruction.
1497
///
1498
/// \param __a
1499
///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1500
/// \returns A double-precision floating-point value copied from the lower 64
1501
///    bits of \a __a.
1502
static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
  /* Element 0 is the low-order double of the vector. */
  double __lo = __a[0];
  return __lo;
}
1505
 
1506
/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1507
///    memory location.
1508
///
1509
/// \headerfile <x86intrin.h>
1510
///
1511
/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1512
///
1513
/// \param __dp
1514
///    A pointer to a 128-bit memory location. The address of the memory
1515
///    location has to be 16-byte aligned.
1516
/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1517
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
  /* Read through a pointer to the 16-byte-aligned vector type. */
  const __m128d *__vp = (const __m128d *)__dp;
  return *__vp;
}
1520
 
1521
/// Loads a double-precision floating-point value from a specified memory
1522
///    location and duplicates it to both vector elements of a 128-bit vector of
1523
///    [2 x double].
1524
///
1525
/// \headerfile <x86intrin.h>
1526
///
1527
/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1528
///
1529
/// \param __dp
1530
///    A pointer to a memory location containing a double-precision value.
1531
/// \returns A 128-bit vector of [2 x double] containing the loaded and
1532
///    duplicated values.
1533
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
  /* Packed, may_alias wrapper used to read the scalar from __dp. */
  struct __mm_load1_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  const struct __mm_load1_pd_struct *__src =
      (const struct __mm_load1_pd_struct *)__dp;
  double __val = __src->__u;
  /* Broadcast the loaded scalar into both elements. */
  return __extension__(__m128d){__val, __val};
}
1540
 
1541
/* Alternate spelling of _mm_load1_pd: expands to a call to _mm_load1_pd with
 * the same argument. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1542
 
1543
/// Loads two double-precision values, in reverse order, from an aligned
1544
///    memory location into a 128-bit vector of [2 x double].
1545
///
1546
/// \headerfile <x86intrin.h>
1547
///
1548
/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1549
/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1550
/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1551
///
1552
/// \param __dp
1553
///    A 16-byte aligned pointer to an array of double-precision values to be
1554
///    loaded in reverse order.
1555
/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1556
///    values.
1557
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
  /* Aligned vector load, followed by a swap of the two elements. */
  const __m128d *__vp = (const __m128d *)__dp;
  __v2df __fwd = (__v2df)*__vp;
  return __builtin_shufflevector(__fwd, __fwd, 1, 0);
}
1561
 
1562
/// Loads a 128-bit floating-point vector of [2 x double] from an
1563
///    unaligned memory location.
1564
///
1565
/// \headerfile <x86intrin.h>
1566
///
1567
/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1568
///
1569
/// \param __dp
1570
///    A pointer to a 128-bit memory location. The address of the memory
1571
///    location does not have to be aligned.
1572
/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1573
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
  /* Packed, may_alias wrapper around the 1-byte-aligned vector type. */
  struct __loadu_pd {
    __m128d_u __v;
  } __attribute__((__packed__, __may_alias__));
  const struct __loadu_pd *__src = (const struct __loadu_pd *)__dp;
  return __src->__v;
}
1579
 
1580
/// Loads a 64-bit integer value to the low element of a 128-bit integer
1581
///    vector and clears the upper element.
1582
///
1583
/// \headerfile <x86intrin.h>
1584
///
1585
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1586
///
1587
/// \param __a
1588
///    A pointer to a 64-bit memory location. The address of the memory
1589
///    location does not have to be aligned.
1590
/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1591
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
  /* Packed, may_alias wrapper used to read a 64-bit integer from __a. */
  struct __loadu_si64 {
    long long __v;
  } __attribute__((__packed__, __may_alias__));
  const struct __loadu_si64 *__src = (const struct __loadu_si64 *)__a;
  long long __lo = __src->__v;
  /* Loaded value in element 0, zero in element 1. */
  return __extension__(__m128i)(__v2di){__lo, 0LL};
}
1598
 
1599
/// Loads a 32-bit integer value to the low element of a 128-bit integer
1600
///    vector and clears the upper elements.
1601
///
1602
/// \headerfile <x86intrin.h>
1603
///
1604
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1605
///
1606
/// \param __a
1607
///    A pointer to a 32-bit memory location. The address of the memory
1608
///    location does not have to be aligned.
1609
/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1610
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
  /* Packed, may_alias wrapper used to read a 32-bit integer from __a. */
  struct __loadu_si32 {
    int __v;
  } __attribute__((__packed__, __may_alias__));
  const struct __loadu_si32 *__src = (const struct __loadu_si32 *)__a;
  int __lo = __src->__v;
  /* Loaded value in element 0, zeros in elements 1..3. */
  return __extension__(__m128i)(__v4si){__lo, 0, 0, 0};
}
1617
 
1618
/// Loads a 16-bit integer value to the low element of a 128-bit integer
1619
///    vector and clears the upper elements.
1620
///
1621
/// \headerfile <x86intrin.h>
1622
///
1623
/// This intrinsic does not correspond to a specific instruction.
1624
///
1625
/// \param __a
1626
///    A pointer to a 16-bit memory location. The address of the memory
1627
///    location does not have to be aligned.
1628
/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1629
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
  /* Packed, may_alias wrapper used to read a 16-bit integer from __a. */
  struct __loadu_si16 {
    short __v;
  } __attribute__((__packed__, __may_alias__));
  const struct __loadu_si16 *__src = (const struct __loadu_si16 *)__a;
  short __lo = __src->__v;
  /* Loaded value in element 0, zeros in elements 1..7. */
  return __extension__(__m128i)(__v8hi){__lo, 0, 0, 0, 0, 0, 0, 0};
}
1636
 
1637
/// Loads a 64-bit double-precision value to the low element of a
1638
///    128-bit floating-point vector of [2 x double] and clears the upper
///    element.
1639
///
1640
/// \headerfile <x86intrin.h>
1641
///
1642
/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1643
///
1644
/// \param __dp
1645
///    A pointer to a memory location containing a double-precision value.
1646
///    The address of the memory location does not have to be aligned.
1647
/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1648
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
  /* Packed, may_alias wrapper used to read the scalar from __dp. */
  struct __mm_load_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  const struct __mm_load_sd_struct *__src =
      (const struct __mm_load_sd_struct *)__dp;
  double __lo = __src->__u;
  /* Loaded value in element 0, zero in element 1. */
  return __extension__(__m128d){__lo, 0};
}
1655
 
1656
/// Loads a double-precision value into the high-order bits of a 128-bit
1657
///    vector of [2 x double]. The low-order bits are copied from the low-order
1658
///    bits of the first operand.
1659
///
1660
/// \headerfile <x86intrin.h>
1661
///
1662
/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1663
///
1664
/// \param __a
1665
///    A 128-bit vector of [2 x double]. \n
1666
///    Bits [63:0] are written to bits [63:0] of the result.
1667
/// \param __dp
1668
///    A pointer to a 64-bit memory location containing a double-precision
1669
///    floating-point value that is loaded. The loaded value is written to bits
1670
///    [127:64] of the result. The address of the memory location does not have
1671
///    to be aligned.
1672
/// \returns A 128-bit vector of [2 x double] containing the moved values.
1673
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
                                                          double const *__dp) {
  /* Packed, may_alias wrapper used to read the scalar from __dp. */
  struct __mm_loadh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  const struct __mm_loadh_pd_struct *__src =
      (const struct __mm_loadh_pd_struct *)__dp;
  double __hi = __src->__u;
  /* Element 0 comes from __a; element 1 is the loaded value. */
  __m128d __res = __a;
  __res[1] = __hi;
  return __res;
}
1681
 
1682
/// Loads a double-precision value into the low-order bits of a 128-bit
1683
///    vector of [2 x double]. The high-order bits are copied from the
1684
///    high-order bits of the first operand.
1685
///
1686
/// \headerfile <x86intrin.h>
1687
///
1688
/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1689
///
1690
/// \param __a
1691
///    A 128-bit vector of [2 x double]. \n
1692
///    Bits [127:64] are written to bits [127:64] of the result.
1693
/// \param __dp
1694
///    A pointer to a 64-bit memory location containing a double-precision
1695
///    floating-point value that is loaded. The loaded value is written to bits
1696
///    [63:0] of the result. The address of the memory location does not have to
1697
///    be aligned.
1698
/// \returns A 128-bit vector of [2 x double] containing the moved values.
1699
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
                                                          double const *__dp) {
  /* Packed, may_alias wrapper used to read the scalar from __dp. */
  struct __mm_loadl_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  const struct __mm_loadl_pd_struct *__src =
      (const struct __mm_loadl_pd_struct *)__dp;
  double __lo = __src->__u;
  /* Element 0 is the loaded value; element 1 comes from __a. */
  __m128d __res = __a;
  __res[0] = __lo;
  return __res;
}
1707
 
1708
/// Constructs a 128-bit floating-point vector of [2 x double] with
1709
///    unspecified content. This could be used as an argument to another
1710
///    intrinsic function where the argument is required but the value is not
1711
///    actually used.
1712
///
1713
/// \headerfile <x86intrin.h>
1714
///
1715
/// This intrinsic has no corresponding instruction.
1716
///
1717
/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1718
///    content.
1719
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
  __m128d __res = (__m128d)__builtin_ia32_undef128();
  return __res;
}
1722
 
1723
/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1724
///    64 bits of the vector are initialized with the specified double-precision
1725
///    floating-point value. The upper 64 bits are set to zero.
1726
///
1727
/// \headerfile <x86intrin.h>
1728
///
1729
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1730
///
1731
/// \param __w
1732
///    A double-precision floating-point value used to initialize the lower 64
1733
///    bits of the result.
1734
/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1735
///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1736
///    set to zero.
1737
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
  /* __w in element 0, zero in element 1. */
  __m128d __res = __extension__(__m128d){__w, 0};
  return __res;
}
1740
 
1741
/// Constructs a 128-bit floating-point vector of [2 x double], with each
1742
///    of the two double-precision floating-point vector elements set to the
1743
///    specified double-precision floating-point value.
1744
///
1745
/// \headerfile <x86intrin.h>
1746
///
1747
/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1748
///
1749
/// \param __w
1750
///    A double-precision floating-point value used to initialize each vector
1751
///    element of the result.
1752
/// \returns An initialized 128-bit floating-point vector of [2 x double].
1753
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
  /* Same scalar in both elements. */
  __m128d __res = __extension__(__m128d){__w, __w};
  return __res;
}
1756
 
1757
/// Constructs a 128-bit floating-point vector of [2 x double], with each
1758
///    of the two double-precision floating-point vector elements set to the
1759
///    specified double-precision floating-point value.
1760
///
1761
/// \headerfile <x86intrin.h>
1762
///
1763
/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1764
///
1765
/// \param __w
1766
///    A double-precision floating-point value used to initialize each vector
1767
///    element of the result.
1768
/// \returns An initialized 128-bit floating-point vector of [2 x double].
1769
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
  /* Same result as _mm_set1_pd: the scalar is placed in both elements. */
  return __extension__(__m128d){__w, __w};
}
1772
 
1773
/// Constructs a 128-bit floating-point vector of [2 x double]
1774
///    initialized with the specified double-precision floating-point values.
1775
///
1776
/// \headerfile <x86intrin.h>
1777
///
1778
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1779
///
1780
/// \param __w
1781
///    A double-precision floating-point value used to initialize the upper 64
1782
///    bits of the result.
1783
/// \param __x
1784
///    A double-precision floating-point value used to initialize the lower 64
1785
///    bits of the result.
1786
/// \returns An initialized 128-bit floating-point vector of [2 x double].
1787
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
                                                        double __x) {
  /* Element 0 (low) is __x; element 1 (high) is __w. */
  __m128d __res = __extension__(__m128d){__x, __w};
  return __res;
}
1791
 
1792
/// Constructs a 128-bit floating-point vector of [2 x double],
1793
///    initialized in reverse order with the specified double-precision
1794
///    floating-point values.
1795
///
1796
/// \headerfile <x86intrin.h>
1797
///
1798
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1799
///
1800
/// \param __w
1801
///    A double-precision floating-point value used to initialize the lower 64
1802
///    bits of the result.
1803
/// \param __x
1804
///    A double-precision floating-point value used to initialize the upper 64
1805
///    bits of the result.
1806
/// \returns An initialized 128-bit floating-point vector of [2 x double].
1807
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
                                                         double __x) {
  /* Element 0 (low) is __w; element 1 (high) is __x. */
  __m128d __res = __extension__(__m128d){__w, __x};
  return __res;
}
1811
 
1812
/// Constructs a 128-bit floating-point vector of [2 x double]
1813
///    initialized to zero.
1814
///
1815
/// \headerfile <x86intrin.h>
1816
///
1817
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1818
///
1819
/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1820
///    all elements set to zero.
1821
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
  __m128d __zero = __extension__(__m128d){0.0, 0.0};
  return __zero;
}
1824
 
1825
/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1826
///    64 bits are set to the lower 64 bits of the second parameter. The upper
1827
///    64 bits are set to the upper 64 bits of the first parameter.
1828
///
1829
/// \headerfile <x86intrin.h>
1830
///
1831
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1832
///
1833
/// \param __a
1834
///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1835
///    upper 64 bits of the result.
1836
/// \param __b
1837
///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1838
///    lower 64 bits of the result.
1839
/// \returns A 128-bit vector of [2 x double] containing the moved values.
1840
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
                                                         __m128d __b) {
  /* Element 0 is taken from __b; element 1 is taken from __a. */
  return __extension__(__m128d){__b[0], __a[1]};
}
1845
 
1846
/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1847
///    memory location.
1848
///
1849
/// \headerfile <x86intrin.h>
1850
///
1851
/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1852
///
1853
/// \param __dp
1854
///    A pointer to a 64-bit memory location.
1855
/// \param __a
1856
///    A 128-bit vector of [2 x double] containing the value to be stored.
1857
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
                                                       __m128d __a) {
  /* Packed, may_alias wrapper used to write the scalar to __dp. */
  struct __mm_store_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  struct __mm_store_sd_struct *__dst = (struct __mm_store_sd_struct *)__dp;
  __dst->__u = __a[0];
}
1864
 
1865
/// Moves packed double-precision values from a 128-bit vector of
1866
///    [2 x double] to a memory location.
1867
///
1868
/// \headerfile <x86intrin.h>
1869
///
1870
/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1871
///
1872
/// \param __dp
1873
///    A pointer to an aligned memory location that can store two
1874
///    double-precision values.
1875
/// \param __a
1876
///    A packed 128-bit vector of [2 x double] containing the values to be
1877
///    moved.
1878
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
                                                       __m128d __a) {
  /* Write through a pointer to the 16-byte-aligned vector type. */
  __m128d *__dst = (__m128d *)__dp;
  *__dst = __a;
}
1882
 
1883
/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1884
///    the upper and lower 64 bits of a memory location.
1885
///
1886
/// \headerfile <x86intrin.h>
1887
///
1888
/// This intrinsic corresponds to the
1889
///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1890
///
1891
/// \param __dp
1892
///    A pointer to a memory location that can store two double-precision
1893
///    values. The address of the memory location has to be 16-byte
///    aligned.
1894
/// \param __a
1895
///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1896
///    of the values in \a __dp.
1897
static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
                                                        __m128d __a) {
  /* Duplicate element 0 into both lanes, then do the same aligned vector
   * store that _mm_store_pd performs. */
  __v2df __src = (__v2df)__a;
  __m128d __dup = __builtin_shufflevector(__src, __src, 0, 0);
  *(__m128d *)__dp = __dup;
}
1902
 
1903
/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1904
///    the upper and lower 64 bits of a memory location.
1905
///
1906
/// \headerfile <x86intrin.h>
1907
///
1908
/// This intrinsic corresponds to the
1909
///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1910
///
1911
/// \param __dp
1912
///    A pointer to a memory location that can store two double-precision
1913
///    values. The address of the memory location has to be 16-byte
///    aligned.
1914
/// \param __a
1915
///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1916
///    of the values in \a __dp.
1917
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
                                                        __m128d __a) {
  /* Same operation as _mm_store1_pd: duplicate element 0 into both lanes and
   * store the result through a pointer to the aligned vector type. */
  __v2df __src = (__v2df)__a;
  __m128d __dup = __builtin_shufflevector(__src, __src, 0, 0);
  *(__m128d *)__dp = __dup;
}
1921
 
1922
/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1923
///    location.
1924
///
1925
/// \headerfile <x86intrin.h>
1926
///
1927
/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1928
///
1929
/// \param __dp
1930
///    A pointer to a 128-bit memory location. The address of the memory
1931
///    location does not have to be aligned.
1932
/// \param __a
1933
///    A 128-bit vector of [2 x double] containing the values to be stored.
1934
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
                                                        __m128d __a) {
  /* Packed, may_alias wrapper around the 1-byte-aligned vector type. */
  struct __storeu_pd {
    __m128d_u __v;
  } __attribute__((__packed__, __may_alias__));
  struct __storeu_pd *__dst = (struct __storeu_pd *)__dp;
  __dst->__v = __a;
}
1941
 
1942
/// Stores two double-precision values, in reverse order, from a 128-bit
1943
///    vector of [2 x double] to a 16-byte aligned memory location.
1944
///
1945
/// \headerfile <x86intrin.h>
1946
///
1947
/// This intrinsic corresponds to a shuffling instruction followed by a
1948
/// <c> VMOVAPD / MOVAPD </c> instruction.
1949
///
1950
/// \param __dp
1951
///    A pointer to a 16-byte aligned memory location that can store two
1952
///    double-precision values.
1953
/// \param __a
1954
///    A 128-bit vector of [2 x double] containing the values to be reversed and
1955
///    stored.
1956
static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
                                                        __m128d __a) {
  /* Swap the two elements, then store through a pointer to the aligned
   * vector type. */
  __v2df __src = (__v2df)__a;
  __m128d __rev = __builtin_shufflevector(__src, __src, 1, 0);
  __m128d *__dst = (__m128d *)__dp;
  *__dst = __rev;
}
1961
 
1962
/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1963
///    memory location.
1964
///
1965
/// \headerfile <x86intrin.h>
1966
///
1967
/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1968
///
1969
/// \param __dp
1970
///    A pointer to a 64-bit memory location.
1971
/// \param __a
1972
///    A 128-bit vector of [2 x double] containing the value to be stored.
1973
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
                                                        __m128d __a) {
  /* Packed, may_alias wrapper used to write the scalar to __dp. */
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  struct __mm_storeh_pd_struct *__dst = (struct __mm_storeh_pd_struct *)__dp;
  /* Element 1 is the upper 64 bits of the vector. */
  __dst->__u = __a[1];
}
1980
 
1981
/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1982
///    memory location.
1983
///
1984
/// \headerfile <x86intrin.h>
1985
///
1986
/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1987
///
1988
/// \param __dp
1989
///    A pointer to a 64-bit memory location.
1990
/// \param __a
1991
///    A 128-bit vector of [2 x double] containing the value to be stored.
1992
static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1993
                                                        __m128d __a) {
1994
  struct __mm_storeh_pd_struct {
1995
    double __u;
1996
  } __attribute__((__packed__, __may_alias__));
1997
  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1998
}
1999
 
2000
/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2001
///    saving the lower 8 bits of each sum in the corresponding element of a
2002
///    128-bit result vector of [16 x i8].
2003
///
2004
///    The integer elements of both parameters can be either signed or unsigned.
2005
///
2006
/// \headerfile <x86intrin.h>
2007
///
2008
/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2009
///
2010
/// \param __a
2011
///    A 128-bit vector of [16 x i8].
2012
/// \param __b
2013
///    A 128-bit vector of [16 x i8].
2014
/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2015
///    parameters.
2016
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
                                                          __m128i __b) {
  /* The addition is carried out on unsigned 8-bit lanes. */
  __v16qu __lhs = (__v16qu)__a;
  __v16qu __rhs = (__v16qu)__b;
  return (__m128i)(__lhs + __rhs);
}
2020
 
2021
/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2022
///    saving the lower 16 bits of each sum in the corresponding element of a
2023
///    128-bit result vector of [8 x i16].
2024
///
2025
///    The integer elements of both parameters can be either signed or unsigned.
2026
///
2027
/// \headerfile <x86intrin.h>
2028
///
2029
/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2030
///
2031
/// \param __a
2032
///    A 128-bit vector of [8 x i16].
2033
/// \param __b
2034
///    A 128-bit vector of [8 x i16].
2035
/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2036
///    parameters.
2037
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
                                                           __m128i __b) {
  /* Reinterpret both inputs as unsigned short lanes, then add lane-wise. */
  __v8hu __lhs = (__v8hu)__a;
  __v8hu __rhs = (__v8hu)__b;
  return (__m128i)(__lhs + __rhs);
}
2041
 
2042
/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2043
///    saving the lower 32 bits of each sum in the corresponding element of a
2044
///    128-bit result vector of [4 x i32].
2045
///
2046
///    The integer elements of both parameters can be either signed or unsigned.
2047
///
2048
/// \headerfile <x86intrin.h>
2049
///
2050
/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2051
///
2052
/// \param __a
2053
///    A 128-bit vector of [4 x i32].
2054
/// \param __b
2055
///    A 128-bit vector of [4 x i32].
2056
/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2057
///    parameters.
2058
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
                                                           __m128i __b) {
  /* The addition is carried out on __v4su lanes. */
  __v4su __sum = (__v4su)__a + (__v4su)__b;
  return (__m128i)__sum;
}
2062
 
2063
/// Adds two signed or unsigned 64-bit integer values, returning the
2064
///    lower 64 bits of the sum.
2065
///
2066
/// \headerfile <x86intrin.h>
2067
///
2068
/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2069
///
2070
/// \param __a
2071
///    A 64-bit integer.
2072
/// \param __b
2073
///    A 64-bit integer.
2074
/// \returns A 64-bit integer containing the sum of both parameters.
2075
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
                                                            __m64 __b) {
  /* Both operands are handed to the builtin as __v1di values. */
  __v1di __lhs = (__v1di)__a;
  __v1di __rhs = (__v1di)__b;
  return (__m64)__builtin_ia32_paddq(__lhs, __rhs);
}
2079
 
2080
/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2081
///    saving the lower 64 bits of each sum in the corresponding element of a
2082
///    128-bit result vector of [2 x i64].
2083
///
2084
///    The integer elements of both parameters can be either signed or unsigned.
2085
///
2086
/// \headerfile <x86intrin.h>
2087
///
2088
/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2089
///
2090
/// \param __a
2091
///    A 128-bit vector of [2 x i64].
2092
/// \param __b
2093
///    A 128-bit vector of [2 x i64].
2094
/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2095
///    parameters.
2096
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
                                                           __m128i __b) {
  /* The addition is carried out on unsigned long long lanes. */
  __v2du __sum = (__v2du)__a + (__v2du)__b;
  return (__m128i)__sum;
}
2100
 
2101
/// Adds, with saturation, the corresponding elements of two 128-bit
2102
///    signed [16 x i8] vectors, saving each sum in the corresponding element of
2103
///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2104
///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2105
///
2106
/// \headerfile <x86intrin.h>
2107
///
2108
/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2109
///
2110
/// \param __a
2111
///    A 128-bit signed [16 x i8] vector.
2112
/// \param __b
2113
///    A 128-bit signed [16 x i8] vector.
2114
/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2115
///    both parameters.
2116
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
                                                           __m128i __b) {
  /* Operands are viewed as explicitly signed char lanes for the
     saturating-add builtin. */
  __v16qs __lhs = (__v16qs)__a;
  __v16qs __rhs = (__v16qs)__b;
  return (__m128i)__builtin_elementwise_add_sat(__lhs, __rhs);
}
2120
 
2121
/// Adds, with saturation, the corresponding elements of two 128-bit
2122
///    signed [8 x i16] vectors, saving each sum in the corresponding element of
2123
///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2124
///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2125
///    0x8000.
2126
///
2127
/// \headerfile <x86intrin.h>
2128
///
2129
/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2130
///
2131
/// \param __a
2132
///    A 128-bit signed [8 x i16] vector.
2133
/// \param __b
2134
///    A 128-bit signed [8 x i16] vector.
2135
/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2136
///    both parameters.
2137
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
                                                            __m128i __b) {
  /* Operands are viewed as short lanes for the saturating-add builtin. */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_elementwise_add_sat(__lhs, __rhs);
}
2141
 
2142
/// Adds, with saturation, the corresponding elements of two 128-bit
2143
///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2144
///    of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
2145
///    are saturated to 0xFF. Negative sums are saturated to 0x00.
2146
///
2147
/// \headerfile <x86intrin.h>
2148
///
2149
/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2150
///
2151
/// \param __a
2152
///    A 128-bit unsigned [16 x i8] vector.
2153
/// \param __b
2154
///    A 128-bit unsigned [16 x i8] vector.
2155
/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2156
///    of both parameters.
2157
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
                                                           __m128i __b) {
  /* Operands are viewed as unsigned char lanes for the saturating-add
     builtin. */
  __v16qu __lhs = (__v16qu)__a;
  __v16qu __rhs = (__v16qu)__b;
  return (__m128i)__builtin_elementwise_add_sat(__lhs, __rhs);
}
2161
 
2162
/// Adds, with saturation, the corresponding elements of two 128-bit
2163
///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2164
///    of a 128-bit result vector of [8 x i16]. Positive sums greater than
2165
///    0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
2166
///
2167
/// \headerfile <x86intrin.h>
2168
///
2169
/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2170
///
2171
/// \param __a
2172
///    A 128-bit unsigned [8 x i16] vector.
2173
/// \param __b
2174
///    A 128-bit unsigned [8 x i16] vector.
2175
/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2176
///    of both parameters.
2177
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
                                                            __m128i __b) {
  /* Operands are viewed as unsigned short lanes for the saturating-add
     builtin. */
  __v8hu __lhs = (__v8hu)__a;
  __v8hu __rhs = (__v8hu)__b;
  return (__m128i)__builtin_elementwise_add_sat(__lhs, __rhs);
}
2181
 
2182
/// Computes the rounded averages of corresponding elements of two
2183
///    128-bit unsigned [16 x i8] vectors, saving each result in the
2184
///    corresponding element of a 128-bit result vector of [16 x i8].
2185
///
2186
/// \headerfile <x86intrin.h>
2187
///
2188
/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2189
///
2190
/// \param __a
2191
///    A 128-bit unsigned [16 x i8] vector.
2192
/// \param __b
2193
///    A 128-bit unsigned [16 x i8] vector.
2194
/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2195
///    averages of both parameters.
2196
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
                                                          __m128i __b) {
  /* Both operands are passed to the builtin as __v16qi (char lanes). */
  __v16qi __lhs = (__v16qi)__a;
  __v16qi __rhs = (__v16qi)__b;
  return (__m128i)__builtin_ia32_pavgb128(__lhs, __rhs);
}
2200
 
2201
/// Computes the rounded averages of corresponding elements of two
2202
///    128-bit unsigned [8 x i16] vectors, saving each result in the
2203
///    corresponding element of a 128-bit result vector of [8 x i16].
2204
///
2205
/// \headerfile <x86intrin.h>
2206
///
2207
/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2208
///
2209
/// \param __a
2210
///    A 128-bit unsigned [8 x i16] vector.
2211
/// \param __b
2212
///    A 128-bit unsigned [8 x i16] vector.
2213
/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2214
///    averages of both parameters.
2215
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
                                                           __m128i __b) {
  /* Both operands are passed to the builtin as __v8hi (short lanes). */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_ia32_pavgw128(__lhs, __rhs);
}
2219
 
2220
/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2221
///    vectors, producing eight intermediate 32-bit signed integer products, and
2222
///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2223
///    [4 x i32] vector.
2224
///
2225
///    For example, bits [15:0] of both parameters are multiplied producing a
2226
///    32-bit product, bits [31:16] of both parameters are multiplied producing
2227
///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2228
///    of the result.
2229
///
2230
/// \headerfile <x86intrin.h>
2231
///
2232
/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2233
///
2234
/// \param __a
2235
///    A 128-bit signed [8 x i16] vector.
2236
/// \param __b
2237
///    A 128-bit signed [8 x i16] vector.
2238
/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2239
///    of both parameters.
2240
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
                                                            __m128i __b) {
  /* Both operands are passed to the builtin as __v8hi (short lanes). */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_ia32_pmaddwd128(__lhs, __rhs);
}
2244
 
2245
/// Compares corresponding elements of two 128-bit signed [8 x i16]
2246
///    vectors, saving the greater value from each comparison in the
2247
///    corresponding element of a 128-bit result vector of [8 x i16].
2248
///
2249
/// \headerfile <x86intrin.h>
2250
///
2251
/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2252
///
2253
/// \param __a
2254
///    A 128-bit signed [8 x i16] vector.
2255
/// \param __b
2256
///    A 128-bit signed [8 x i16] vector.
2257
/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2258
///    each comparison.
2259
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
                                                           __m128i __b) {
  /* Element-wise max over short lanes. */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_elementwise_max(__lhs, __rhs);
}
2263
 
2264
/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2265
///    vectors, saving the greater value from each comparison in the
2266
///    corresponding element of a 128-bit result vector of [16 x i8].
2267
///
2268
/// \headerfile <x86intrin.h>
2269
///
2270
/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2271
///
2272
/// \param __a
2273
///    A 128-bit unsigned [16 x i8] vector.
2274
/// \param __b
2275
///    A 128-bit unsigned [16 x i8] vector.
2276
/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2277
///    each comparison.
2278
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
                                                          __m128i __b) {
  /* Element-wise max over unsigned char lanes. */
  __v16qu __lhs = (__v16qu)__a;
  __v16qu __rhs = (__v16qu)__b;
  return (__m128i)__builtin_elementwise_max(__lhs, __rhs);
}
2282
 
2283
/// Compares corresponding elements of two 128-bit signed [8 x i16]
2284
///    vectors, saving the smaller value from each comparison in the
2285
///    corresponding element of a 128-bit result vector of [8 x i16].
2286
///
2287
/// \headerfile <x86intrin.h>
2288
///
2289
/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2290
///
2291
/// \param __a
2292
///    A 128-bit signed [8 x i16] vector.
2293
/// \param __b
2294
///    A 128-bit signed [8 x i16] vector.
2295
/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2296
///    each comparison.
2297
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
                                                           __m128i __b) {
  /* Element-wise min over short lanes. */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_elementwise_min(__lhs, __rhs);
}
2301
 
2302
/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2303
///    vectors, saving the smaller value from each comparison in the
2304
///    corresponding element of a 128-bit result vector of [16 x i8].
2305
///
2306
/// \headerfile <x86intrin.h>
2307
///
2308
/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2309
///
2310
/// \param __a
2311
///    A 128-bit unsigned [16 x i8] vector.
2312
/// \param __b
2313
///    A 128-bit unsigned [16 x i8] vector.
2314
/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2315
///    each comparison.
2316
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
                                                          __m128i __b) {
  /* Element-wise min over unsigned char lanes. */
  __v16qu __lhs = (__v16qu)__a;
  __v16qu __rhs = (__v16qu)__b;
  return (__m128i)__builtin_elementwise_min(__lhs, __rhs);
}
2320
 
2321
/// Multiplies the corresponding elements of two signed [8 x i16]
2322
///    vectors, saving the upper 16 bits of each 32-bit product in the
2323
///    corresponding element of a 128-bit signed [8 x i16] result vector.
2324
///
2325
/// \headerfile <x86intrin.h>
2326
///
2327
/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2328
///
2329
/// \param __a
2330
///    A 128-bit signed [8 x i16] vector.
2331
/// \param __b
2332
///    A 128-bit signed [8 x i16] vector.
2333
/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2334
///    each of the eight 32-bit products.
2335
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
                                                             __m128i __b) {
  /* Both operands are passed to the builtin as __v8hi (short lanes). */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_ia32_pmulhw128(__lhs, __rhs);
}
2339
 
2340
/// Multiplies the corresponding elements of two unsigned [8 x i16]
2341
///    vectors, saving the upper 16 bits of each 32-bit product in the
2342
///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2343
///
2344
/// \headerfile <x86intrin.h>
2345
///
2346
/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2347
///
2348
/// \param __a
2349
///    A 128-bit unsigned [8 x i16] vector.
2350
/// \param __b
2351
///    A 128-bit unsigned [8 x i16] vector.
2352
/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2353
///    of each of the eight 32-bit products.
2354
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
                                                             __m128i __b) {
  /* The builtin is called with __v8hi operands here, as in the signed
     variant; only the builtin differs. */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_ia32_pmulhuw128(__lhs, __rhs);
}
2358
 
2359
/// Multiplies the corresponding elements of two signed [8 x i16]
2360
///    vectors, saving the lower 16 bits of each 32-bit product in the
2361
///    corresponding element of a 128-bit signed [8 x i16] result vector.
2362
///
2363
/// \headerfile <x86intrin.h>
2364
///
2365
/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2366
///
2367
/// \param __a
2368
///    A 128-bit signed [8 x i16] vector.
2369
/// \param __b
2370
///    A 128-bit signed [8 x i16] vector.
2371
/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2372
///    each of the eight 32-bit products.
2373
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
                                                             __m128i __b) {
  /* The multiplication is carried out on unsigned short lanes. */
  __v8hu __product = (__v8hu)__a * (__v8hu)__b;
  return (__m128i)__product;
}
2377
 
2378
/// Multiplies 32-bit unsigned integer values contained in the lower bits
2379
///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2380
///    product.
2381
///
2382
/// \headerfile <x86intrin.h>
2383
///
2384
/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2385
///
2386
/// \param __a
2387
///    A 64-bit integer containing one of the source operands.
2388
/// \param __b
2389
///    A 64-bit integer containing one of the source operands.
2390
/// \returns A 64-bit integer vector containing the product of both operands.
2391
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
                                                            __m64 __b) {
  /* Both operands are handed to the builtin as __v2si values; its result is
     returned without a further cast. */
  __v2si __lhs = (__v2si)__a;
  __v2si __rhs = (__v2si)__b;
  return __builtin_ia32_pmuludq(__lhs, __rhs);
}
2395
 
2396
/// Multiplies 32-bit unsigned integer values contained in the lower
2397
///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2398
///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2399
///
2400
/// \headerfile <x86intrin.h>
2401
///
2402
/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2403
///
2404
/// \param __a
2405
///    A [2 x i64] vector containing one of the source operands.
2406
/// \param __b
2407
///    A [2 x i64] vector containing one of the source operands.
2408
/// \returns A [2 x i64] vector containing the product of both operands.
2409
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
                                                           __m128i __b) {
  /* Both operands are handed to the builtin as __v4si values; its result is
     returned without a further cast. */
  __v4si __lhs = (__v4si)__a;
  __v4si __rhs = (__v4si)__b;
  return __builtin_ia32_pmuludq128(__lhs, __rhs);
}
2413
 
2414
/// Computes the absolute differences of corresponding 8-bit integer
2415
///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2416
///    separately sums the second 8 absolute differences. Packs these two
2417
///    unsigned 16-bit integer sums into the upper and lower elements of a
2418
///    [2 x i64] vector.
2419
///
2420
/// \headerfile <x86intrin.h>
2421
///
2422
/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2423
///
2424
/// \param __a
2425
///    A 128-bit integer vector containing one of the source operands.
2426
/// \param __b
2427
///    A 128-bit integer vector containing one of the source operands.
2428
/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2429
///    differences between both operands.
2430
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
                                                          __m128i __b) {
  /* Both operands are handed to the builtin as __v16qi (char lanes); its
     result is returned without a further cast. */
  __v16qi __lhs = (__v16qi)__a;
  __v16qi __rhs = (__v16qi)__b;
  return __builtin_ia32_psadbw128(__lhs, __rhs);
}
2434
 
2435
/// Subtracts the corresponding 8-bit integer values in the operands.
2436
///
2437
/// \headerfile <x86intrin.h>
2438
///
2439
/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2440
///
2441
/// \param __a
2442
///    A 128-bit integer vector containing the minuends.
2443
/// \param __b
2444
///    A 128-bit integer vector containing the subtrahends.
2445
/// \returns A 128-bit integer vector containing the differences of the values
2446
///    in the operands.
2447
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
                                                          __m128i __b) {
  /* The subtraction is carried out on unsigned char lanes. */
  __v16qu __diff = (__v16qu)__a - (__v16qu)__b;
  return (__m128i)__diff;
}
2451
 
2452
/// Subtracts the corresponding 16-bit integer values in the operands.
2453
///
2454
/// \headerfile <x86intrin.h>
2455
///
2456
/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2457
///
2458
/// \param __a
2459
///    A 128-bit integer vector containing the minuends.
2460
/// \param __b
2461
///    A 128-bit integer vector containing the subtrahends.
2462
/// \returns A 128-bit integer vector containing the differences of the values
2463
///    in the operands.
2464
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
                                                           __m128i __b) {
  /* Reinterpret both inputs as unsigned short lanes, then subtract
     lane-wise (__a minus __b). */
  __v8hu __minuend = (__v8hu)__a;
  __v8hu __subtrahend = (__v8hu)__b;
  return (__m128i)(__minuend - __subtrahend);
}
2468
 
2469
/// Subtracts the corresponding 32-bit integer values in the operands.
2470
///
2471
/// \headerfile <x86intrin.h>
2472
///
2473
/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2474
///
2475
/// \param __a
2476
///    A 128-bit integer vector containing the minuends.
2477
/// \param __b
2478
///    A 128-bit integer vector containing the subtrahends.
2479
/// \returns A 128-bit integer vector containing the differences of the values
2480
///    in the operands.
2481
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
                                                           __m128i __b) {
  /* The subtraction is carried out on __v4su lanes. */
  __v4su __diff = (__v4su)__a - (__v4su)__b;
  return (__m128i)__diff;
}
2485
 
2486
/// Subtracts signed or unsigned 64-bit integer values and writes the
2487
///    difference to the corresponding bits in the destination.
2488
///
2489
/// \headerfile <x86intrin.h>
2490
///
2491
/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2492
///
2493
/// \param __a
2494
///    A 64-bit integer vector containing the minuend.
2495
/// \param __b
2496
///    A 64-bit integer vector containing the subtrahend.
2497
/// \returns A 64-bit integer vector containing the difference of the values in
2498
///    the operands.
2499
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
                                                            __m64 __b) {
  /* Both operands are handed to the builtin as __v1di values, minuend
     first. */
  __v1di __minuend = (__v1di)__a;
  __v1di __subtrahend = (__v1di)__b;
  return (__m64)__builtin_ia32_psubq(__minuend, __subtrahend);
}
2503
 
2504
/// Subtracts the corresponding elements of two [2 x i64] vectors.
2505
///
2506
/// \headerfile <x86intrin.h>
2507
///
2508
/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2509
///
2510
/// \param __a
2511
///    A 128-bit integer vector containing the minuends.
2512
/// \param __b
2513
///    A 128-bit integer vector containing the subtrahends.
2514
/// \returns A 128-bit integer vector containing the differences of the values
2515
///    in the operands.
2516
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
                                                           __m128i __b) {
  /* The subtraction is carried out on unsigned long long lanes. */
  __v2du __diff = (__v2du)__a - (__v2du)__b;
  return (__m128i)__diff;
}
2520
 
2521
/// Subtracts corresponding 8-bit signed integer values in the input and
2522
///    returns the differences in the corresponding bytes in the destination.
2523
///    Differences greater than 0x7F are saturated to 0x7F, and differences less
2524
///    than 0x80 are saturated to 0x80.
2525
///
2526
/// \headerfile <x86intrin.h>
2527
///
2528
/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2529
///
2530
/// \param __a
2531
///    A 128-bit integer vector containing the minuends.
2532
/// \param __b
2533
///    A 128-bit integer vector containing the subtrahends.
2534
/// \returns A 128-bit integer vector containing the differences of the values
2535
///    in the operands.
2536
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
                                                           __m128i __b) {
  /* Operands are viewed as explicitly signed char lanes for the
     saturating-subtract builtin. */
  __v16qs __minuend = (__v16qs)__a;
  __v16qs __subtrahend = (__v16qs)__b;
  return (__m128i)__builtin_elementwise_sub_sat(__minuend, __subtrahend);
}
2540
 
2541
/// Subtracts corresponding 16-bit signed integer values in the input and
2542
///    returns the differences in the corresponding bytes in the destination.
2543
///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2544
///    than 0x8000 are saturated to 0x8000.
2545
///
2546
/// \headerfile <x86intrin.h>
2547
///
2548
/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2549
///
2550
/// \param __a
2551
///    A 128-bit integer vector containing the minuends.
2552
/// \param __b
2553
///    A 128-bit integer vector containing the subtrahends.
2554
/// \returns A 128-bit integer vector containing the differences of the values
2555
///    in the operands.
2556
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
                                                            __m128i __b) {
  /* Operands are viewed as short lanes for the saturating-subtract
     builtin. */
  __v8hi __minuend = (__v8hi)__a;
  __v8hi __subtrahend = (__v8hi)__b;
  return (__m128i)__builtin_elementwise_sub_sat(__minuend, __subtrahend);
}
2560
 
2561
/// Subtracts corresponding 8-bit unsigned integer values in the input
2562
///    and returns the differences in the corresponding bytes in the
2563
///    destination. Differences less than 0x00 are saturated to 0x00.
2564
///
2565
/// \headerfile <x86intrin.h>
2566
///
2567
/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2568
///
2569
/// \param __a
2570
///    A 128-bit integer vector containing the minuends.
2571
/// \param __b
2572
///    A 128-bit integer vector containing the subtrahends.
2573
/// \returns A 128-bit integer vector containing the unsigned integer
2574
///    differences of the values in the operands.
2575
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
                                                           __m128i __b) {
  /* Operands are viewed as unsigned char lanes for the saturating-subtract
     builtin. */
  __v16qu __minuend = (__v16qu)__a;
  __v16qu __subtrahend = (__v16qu)__b;
  return (__m128i)__builtin_elementwise_sub_sat(__minuend, __subtrahend);
}
2579
 
2580
/// Subtracts corresponding 16-bit unsigned integer values in the input
2581
///    and returns the differences in the corresponding bytes in the
2582
///    destination. Differences less than 0x0000 are saturated to 0x0000.
2583
///
2584
/// \headerfile <x86intrin.h>
2585
///
2586
/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2587
///
2588
/// \param __a
2589
///    A 128-bit integer vector containing the minuends.
2590
/// \param __b
2591
///    A 128-bit integer vector containing the subtrahends.
2592
/// \returns A 128-bit integer vector containing the unsigned integer
2593
///    differences of the values in the operands.
2594
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
                                                            __m128i __b) {
  /* Operands are viewed as unsigned short lanes for the saturating-subtract
     builtin. */
  __v8hu __minuend = (__v8hu)__a;
  __v8hu __subtrahend = (__v8hu)__b;
  return (__m128i)__builtin_elementwise_sub_sat(__minuend, __subtrahend);
}
2598
 
2599
/// Performs a bitwise AND of two 128-bit integer vectors.
2600
///
2601
/// \headerfile <x86intrin.h>
2602
///
2603
/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2604
///
2605
/// \param __a
2606
///    A 128-bit integer vector containing one of the source operands.
2607
/// \param __b
2608
///    A 128-bit integer vector containing one of the source operands.
2609
/// \returns A 128-bit integer vector containing the bitwise AND of the values
2610
///    in both operands.
2611
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
                                                           __m128i __b) {
  /* Bitwise AND, evaluated on the __v2du view of both operands. */
  __v2du __bits = (__v2du)__a & (__v2du)__b;
  return (__m128i)__bits;
}
2615
 
2616
/// Performs a bitwise AND of two 128-bit integer vectors, using the
2617
///    one's complement of the values contained in the first source operand.
2618
///
2619
/// \headerfile <x86intrin.h>
2620
///
2621
/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2622
///
2623
/// \param __a
2624
///    A 128-bit vector containing the left source operand. The one's complement
2625
///    of this value is used in the bitwise AND.
2626
/// \param __b
2627
///    A 128-bit vector containing the right source operand.
2628
/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2629
///    complement of the first operand and the values in the second operand.
2630
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
                                                              __m128i __b) {
  /* Only the first operand is complemented before the AND. */
  __v2du __inverted = ~(__v2du)__a;
  return (__m128i)(__inverted & (__v2du)__b);
}
2634
/// Performs a bitwise OR of two 128-bit integer vectors.
2635
///
2636
/// \headerfile <x86intrin.h>
2637
///
2638
/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2639
///
2640
/// \param __a
2641
///    A 128-bit integer vector containing one of the source operands.
2642
/// \param __b
2643
///    A 128-bit integer vector containing one of the source operands.
2644
/// \returns A 128-bit integer vector containing the bitwise OR of the values
2645
///    in both operands.
2646
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
                                                          __m128i __b) {
  /* Bitwise OR, evaluated on the __v2du view of both operands. */
  __v2du __bits = (__v2du)__a | (__v2du)__b;
  return (__m128i)__bits;
}
2650
 
2651
/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2652
///
2653
/// \headerfile <x86intrin.h>
2654
///
2655
/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2656
///
2657
/// \param __a
2658
///    A 128-bit integer vector containing one of the source operands.
2659
/// \param __b
2660
///    A 128-bit integer vector containing one of the source operands.
2661
/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2662
///    values in both operands.
2663
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
                                                           __m128i __b) {
  /* Bitwise XOR, evaluated on the __v2du view of both operands. */
  __v2du __bits = (__v2du)__a ^ (__v2du)__b;
  return (__m128i)__bits;
}
2667
 
2668
/// Left-shifts the 128-bit integer vector operand by the specified
2669
///    number of bytes. Low-order bits are cleared.
2670
///
2671
/// \headerfile <x86intrin.h>
2672
///
2673
/// \code
2674
/// __m128i _mm_slli_si128(__m128i a, const int imm);
2675
/// \endcode
2676
///
2677
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2678
///
2679
/// \param a
2680
///    A 128-bit integer vector containing the source operand.
2681
/// \param imm
2682
///    An immediate value specifying the number of bytes to left-shift operand
2683
///    \a a.
2684
/// \returns A 128-bit integer vector containing the left-shifted value.
2685
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2688
 
2689
/// Expands to the same byte-shift builtin call as \c _mm_slli_si128: operand
///    \a a is passed as a [2 x i64] vector and \a imm is passed as an int.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2692
 
2693
/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2694
///    by the specified number of bits. Low-order bits are cleared.
2695
///
2696
/// \headerfile <x86intrin.h>
2697
///
2698
/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2699
///
2700
/// \param __a
2701
///    A 128-bit integer vector containing the source operand.
2702
/// \param __count
2703
///    An integer value specifying the number of bits to left-shift each value
2704
///    in operand \a __a.
2705
/// \returns A 128-bit integer vector containing the left-shifted values.
2706
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
                                                            int __count) {
  /* The source is viewed as short lanes; the scalar count is forwarded to
     the builtin unchanged. */
  __v8hi __src = (__v8hi)__a;
  return (__m128i)__builtin_ia32_psllwi128(__src, __count);
}
2710
 
2711
/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2712
///    by the specified number of bits. Low-order bits are cleared.
2713
///
2714
/// \headerfile <x86intrin.h>
2715
///
2716
/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2717
///
2718
/// \param __a
2719
///    A 128-bit integer vector containing the source operand.
2720
/// \param __count
2721
///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2722
///    to left-shift each value in operand \a __a.
2723
/// \returns A 128-bit integer vector containing the left-shifted values.
2724
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
                                                           __m128i __count) {
  /* Source and count vectors are both passed to the builtin as __v8hi. */
  __v8hi __src = (__v8hi)__a;
  __v8hi __amount = (__v8hi)__count;
  return (__m128i)__builtin_ia32_psllw128(__src, __amount);
}
2728
 
2729
/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2730
///    by the specified number of bits. Low-order bits are cleared.
2731
///
2732
/// \headerfile <x86intrin.h>
2733
///
2734
/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2735
///
2736
/// \param __a
2737
///    A 128-bit integer vector containing the source operand.
2738
/// \param __count
2739
///    An integer value specifying the number of bits to left-shift each value
2740
///    in operand \a __a.
2741
/// \returns A 128-bit integer vector containing the left-shifted values.
2742
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2743
                                                            int __count) {
2744
  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2745
}
2746
 
2747
/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2748
///    by the specified number of bits. Low-order bits are cleared.
2749
///
2750
/// \headerfile <x86intrin.h>
2751
///
2752
/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2753
///
2754
/// \param __a
2755
///    A 128-bit integer vector containing the source operand.
2756
/// \param __count
2757
///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2758
///    to left-shift each value in operand \a __a.
2759
/// \returns A 128-bit integer vector containing the left-shifted values.
2760
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2761
                                                           __m128i __count) {
2762
  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2763
}
2764
 
2765
/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2766
///    by the specified number of bits. Low-order bits are cleared.
2767
///
2768
/// \headerfile <x86intrin.h>
2769
///
2770
/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2771
///
2772
/// \param __a
2773
///    A 128-bit integer vector containing the source operand.
2774
/// \param __count
2775
///    An integer value specifying the number of bits to left-shift each value
2776
///    in operand \a __a.
2777
/// \returns A 128-bit integer vector containing the left-shifted values.
2778
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2779
                                                            int __count) {
2780
  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2781
}
2782
 
2783
/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2784
///    by the specified number of bits. Low-order bits are cleared.
2785
///
2786
/// \headerfile <x86intrin.h>
2787
///
2788
/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2789
///
2790
/// \param __a
2791
///    A 128-bit integer vector containing the source operand.
2792
/// \param __count
2793
///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2794
///    to left-shift each value in operand \a __a.
2795
/// \returns A 128-bit integer vector containing the left-shifted values.
2796
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2797
                                                           __m128i __count) {
2798
  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2799
}
2800
 
2801
/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2802
///    by the specified number of bits. High-order bits are filled with the sign
2803
///    bit of the initial value.
2804
///
2805
/// \headerfile <x86intrin.h>
2806
///
2807
/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2808
///
2809
/// \param __a
2810
///    A 128-bit integer vector containing the source operand.
2811
/// \param __count
2812
///    An integer value specifying the number of bits to right-shift each value
2813
///    in operand \a __a.
2814
/// \returns A 128-bit integer vector containing the right-shifted values.
2815
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2816
                                                            int __count) {
2817
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2818
}
2819
 
2820
/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2821
///    by the specified number of bits. High-order bits are filled with the sign
2822
///    bit of the initial value.
2823
///
2824
/// \headerfile <x86intrin.h>
2825
///
2826
/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2827
///
2828
/// \param __a
2829
///    A 128-bit integer vector containing the source operand.
2830
/// \param __count
2831
///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2832
///    to right-shift each value in operand \a __a.
2833
/// \returns A 128-bit integer vector containing the right-shifted values.
2834
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2835
                                                           __m128i __count) {
2836
  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2837
}
2838
 
2839
/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2840
///    by the specified number of bits. High-order bits are filled with the sign
2841
///    bit of the initial value.
2842
///
2843
/// \headerfile <x86intrin.h>
2844
///
2845
/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2846
///
2847
/// \param __a
2848
///    A 128-bit integer vector containing the source operand.
2849
/// \param __count
2850
///    An integer value specifying the number of bits to right-shift each value
2851
///    in operand \a __a.
2852
/// \returns A 128-bit integer vector containing the right-shifted values.
2853
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2854
                                                            int __count) {
2855
  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2856
}
2857
 
2858
/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2859
///    by the specified number of bits. High-order bits are filled with the sign
2860
///    bit of the initial value.
2861
///
2862
/// \headerfile <x86intrin.h>
2863
///
2864
/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2865
///
2866
/// \param __a
2867
///    A 128-bit integer vector containing the source operand.
2868
/// \param __count
2869
///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2870
///    to right-shift each value in operand \a __a.
2871
/// \returns A 128-bit integer vector containing the right-shifted values.
2872
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2873
                                                           __m128i __count) {
2874
  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2875
}
2876
 
2877
/// Shifts the whole 128-bit integer vector operand to the right by a number
///    of bytes. The vacated high-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    The 128-bit integer vector to be shifted.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted right.
/// \returns A 128-bit integer vector holding the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2897
 
2898
/// Alternate spelling of _mm_srli_si128: expands to the same
///    __builtin_ia32_psrldqi128_byteshift call with the same operands.
#define _mm_bsrli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2901
 
2902
/// Right-shifts each of 16-bit values in the 128-bit integer vector
2903
///    operand by the specified number of bits. High-order bits are cleared.
2904
///
2905
/// \headerfile <x86intrin.h>
2906
///
2907
/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2908
///
2909
/// \param __a
2910
///    A 128-bit integer vector containing the source operand.
2911
/// \param __count
2912
///    An integer value specifying the number of bits to right-shift each value
2913
///    in operand \a __a.
2914
/// \returns A 128-bit integer vector containing the right-shifted values.
2915
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2916
                                                            int __count) {
2917
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2918
}
2919
 
2920
/// Right-shifts each of 16-bit values in the 128-bit integer vector
2921
///    operand by the specified number of bits. High-order bits are cleared.
2922
///
2923
/// \headerfile <x86intrin.h>
2924
///
2925
/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2926
///
2927
/// \param __a
2928
///    A 128-bit integer vector containing the source operand.
2929
/// \param __count
2930
///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2931
///    to right-shift each value in operand \a __a.
2932
/// \returns A 128-bit integer vector containing the right-shifted values.
2933
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2934
                                                           __m128i __count) {
2935
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2936
}
2937
 
2938
/// Right-shifts each of 32-bit values in the 128-bit integer vector
2939
///    operand by the specified number of bits. High-order bits are cleared.
2940
///
2941
/// \headerfile <x86intrin.h>
2942
///
2943
/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2944
///
2945
/// \param __a
2946
///    A 128-bit integer vector containing the source operand.
2947
/// \param __count
2948
///    An integer value specifying the number of bits to right-shift each value
2949
///    in operand \a __a.
2950
/// \returns A 128-bit integer vector containing the right-shifted values.
2951
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2952
                                                            int __count) {
2953
  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2954
}
2955
 
2956
/// Right-shifts each of 32-bit values in the 128-bit integer vector
2957
///    operand by the specified number of bits. High-order bits are cleared.
2958
///
2959
/// \headerfile <x86intrin.h>
2960
///
2961
/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2962
///
2963
/// \param __a
2964
///    A 128-bit integer vector containing the source operand.
2965
/// \param __count
2966
///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2967
///    to right-shift each value in operand \a __a.
2968
/// \returns A 128-bit integer vector containing the right-shifted values.
2969
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2970
                                                           __m128i __count) {
2971
  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2972
}
2973
 
2974
/// Right-shifts each of 64-bit values in the 128-bit integer vector
2975
///    operand by the specified number of bits. High-order bits are cleared.
2976
///
2977
/// \headerfile <x86intrin.h>
2978
///
2979
/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2980
///
2981
/// \param __a
2982
///    A 128-bit integer vector containing the source operand.
2983
/// \param __count
2984
///    An integer value specifying the number of bits to right-shift each value
2985
///    in operand \a __a.
2986
/// \returns A 128-bit integer vector containing the right-shifted values.
2987
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2988
                                                            int __count) {
2989
  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2990
}
2991
 
2992
/// Right-shifts each of 64-bit values in the 128-bit integer vector
2993
///    operand by the specified number of bits. High-order bits are cleared.
2994
///
2995
/// \headerfile <x86intrin.h>
2996
///
2997
/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2998
///
2999
/// \param __a
3000
///    A 128-bit integer vector containing the source operand.
3001
/// \param __count
3002
///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3003
///    to right-shift each value in operand \a __a.
3004
/// \returns A 128-bit integer vector containing the right-shifted values.
3005
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3006
                                                           __m128i __count) {
3007
  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3008
}
3009
 
3010
/// Compares each of the corresponding 8-bit values of the 128-bit
3011
///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3012
///    for true.
3013
///
3014
/// \headerfile <x86intrin.h>
3015
///
3016
/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3017
///
3018
/// \param __a
3019
///    A 128-bit integer vector.
3020
/// \param __b
3021
///    A 128-bit integer vector.
3022
/// \returns A 128-bit integer vector containing the comparison results.
3023
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3024
                                                            __m128i __b) {
3025
  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3026
}
3027
 
3028
/// Compares each of the corresponding 16-bit values of the 128-bit
3029
///    integer vectors for equality. Each comparison yields 0x0 for false,
3030
///    0xFFFF for true.
3031
///
3032
/// \headerfile <x86intrin.h>
3033
///
3034
/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3035
///
3036
/// \param __a
3037
///    A 128-bit integer vector.
3038
/// \param __b
3039
///    A 128-bit integer vector.
3040
/// \returns A 128-bit integer vector containing the comparison results.
3041
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3042
                                                             __m128i __b) {
3043
  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3044
}
3045
 
3046
/// Compares each of the corresponding 32-bit values of the 128-bit
3047
///    integer vectors for equality. Each comparison yields 0x0 for false,
3048
///    0xFFFFFFFF for true.
3049
///
3050
/// \headerfile <x86intrin.h>
3051
///
3052
/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3053
///
3054
/// \param __a
3055
///    A 128-bit integer vector.
3056
/// \param __b
3057
///    A 128-bit integer vector.
3058
/// \returns A 128-bit integer vector containing the comparison results.
3059
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3060
                                                             __m128i __b) {
3061
  return (__m128i)((__v4si)__a == (__v4si)__b);
3062
}
3063
 
3064
/// Compares each of the corresponding signed 8-bit values of the 128-bit
3065
///    integer vectors to determine if the values in the first operand are
3066
///    greater than those in the second operand. Each comparison yields 0x0 for
3067
///    false, 0xFF for true.
3068
///
3069
/// \headerfile <x86intrin.h>
3070
///
3071
/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3072
///
3073
/// \param __a
3074
///    A 128-bit integer vector.
3075
/// \param __b
3076
///    A 128-bit integer vector.
3077
/// \returns A 128-bit integer vector containing the comparison results.
3078
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3079
                                                            __m128i __b) {
3080
  /* This function always performs a signed comparison, but __v16qi is a char
3081
     which may be signed or unsigned, so use __v16qs. */
3082
  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3083
}
3084
 
3085
/// Compares each of the corresponding signed 16-bit values of the
3086
///    128-bit integer vectors to determine if the values in the first operand
3087
///    are greater than those in the second operand.
3088
///
3089
///    Each comparison yields 0x0 for false, 0xFFFF for true.
3090
///
3091
/// \headerfile <x86intrin.h>
3092
///
3093
/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3094
///
3095
/// \param __a
3096
///    A 128-bit integer vector.
3097
/// \param __b
3098
///    A 128-bit integer vector.
3099
/// \returns A 128-bit integer vector containing the comparison results.
3100
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3101
                                                             __m128i __b) {
3102
  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3103
}
3104
 
3105
/// Compares each of the corresponding signed 32-bit values of the
3106
///    128-bit integer vectors to determine if the values in the first operand
3107
///    are greater than those in the second operand.
3108
///
3109
///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3110
///
3111
/// \headerfile <x86intrin.h>
3112
///
3113
/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3114
///
3115
/// \param __a
3116
///    A 128-bit integer vector.
3117
/// \param __b
3118
///    A 128-bit integer vector.
3119
/// \returns A 128-bit integer vector containing the comparison results.
3120
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3121
                                                             __m128i __b) {
3122
  return (__m128i)((__v4si)__a > (__v4si)__b);
3123
}
3124
 
3125
/// Compares each of the corresponding signed 8-bit values of the 128-bit
3126
///    integer vectors to determine if the values in the first operand are less
3127
///    than those in the second operand.
3128
///
3129
///    Each comparison yields 0x0 for false, 0xFF for true.
3130
///
3131
/// \headerfile <x86intrin.h>
3132
///
3133
/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3134
///
3135
/// \param __a
3136
///    A 128-bit integer vector.
3137
/// \param __b
3138
///    A 128-bit integer vector.
3139
/// \returns A 128-bit integer vector containing the comparison results.
3140
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3141
                                                            __m128i __b) {
3142
  return _mm_cmpgt_epi8(__b, __a);
3143
}
3144
 
3145
/// Compares each of the corresponding signed 16-bit values of the
3146
///    128-bit integer vectors to determine if the values in the first operand
3147
///    are less than those in the second operand.
3148
///
3149
///    Each comparison yields 0x0 for false, 0xFFFF for true.
3150
///
3151
/// \headerfile <x86intrin.h>
3152
///
3153
/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3154
///
3155
/// \param __a
3156
///    A 128-bit integer vector.
3157
/// \param __b
3158
///    A 128-bit integer vector.
3159
/// \returns A 128-bit integer vector containing the comparison results.
3160
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3161
                                                             __m128i __b) {
3162
  return _mm_cmpgt_epi16(__b, __a);
3163
}
3164
 
3165
/// Compares each of the corresponding signed 32-bit values of the
3166
///    128-bit integer vectors to determine if the values in the first operand
3167
///    are less than those in the second operand.
3168
///
3169
///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3170
///
3171
/// \headerfile <x86intrin.h>
3172
///
3173
/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3174
///
3175
/// \param __a
3176
///    A 128-bit integer vector.
3177
/// \param __b
3178
///    A 128-bit integer vector.
3179
/// \returns A 128-bit integer vector containing the comparison results.
3180
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3181
                                                             __m128i __b) {
3182
  return _mm_cmpgt_epi32(__b, __a);
3183
}
3184
 
3185
#ifdef __x86_64__
3186
/// Converts a 64-bit signed integer value from the second operand into a
3187
///    double-precision value and returns it in the lower element of a [2 x
3188
///    double] vector; the upper element of the returned vector is copied from
3189
///    the upper element of the first operand.
3190
///
3191
/// \headerfile <x86intrin.h>
3192
///
3193
/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3194
///
3195
/// \param __a
3196
///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3197
///    copied to the upper 64 bits of the destination.
3198
/// \param __b
3199
///    A 64-bit signed integer operand containing the value to be converted.
3200
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3201
///    converted value of the second operand. The upper 64 bits are copied from
3202
///    the upper 64 bits of the first operand.
3203
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3204
                                                            long long __b) {
3205
  __a[0] = __b;
3206
  return __a;
3207
}
3208
 
3209
/// Converts the first (lower) element of a vector of [2 x double] into a
3210
///    64-bit signed integer value, according to the current rounding mode.
3211
///
3212
/// \headerfile <x86intrin.h>
3213
///
3214
/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3215
///
3216
/// \param __a
3217
///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3218
///    conversion.
3219
/// \returns A 64-bit signed integer containing the converted value.
3220
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3221
  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3222
}
3223
 
3224
/// Converts the first (lower) element of a vector of [2 x double] into a
3225
///    64-bit signed integer value, truncating the result when it is inexact.
3226
///
3227
/// \headerfile <x86intrin.h>
3228
///
3229
/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3230
///   instruction.
3231
///
3232
/// \param __a
3233
///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3234
///    conversion.
3235
/// \returns A 64-bit signed integer containing the converted value.
3236
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3237
  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3238
}
3239
#endif
3240
 
3241
/// Converts a vector of [4 x i32] into a vector of [4 x float].
3242
///
3243
/// \headerfile <x86intrin.h>
3244
///
3245
/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3246
///
3247
/// \param __a
3248
///    A 128-bit integer vector.
3249
/// \returns A 128-bit vector of [4 x float] containing the converted values.
3250
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3251
  return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3252
}
3253
 
3254
/// Converts a vector of [4 x float] into a vector of [4 x i32].
3255
///
3256
/// \headerfile <x86intrin.h>
3257
///
3258
/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3259
///
3260
/// \param __a
3261
///    A 128-bit vector of [4 x float].
3262
/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3263
///    values.
3264
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3265
  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3266
}
3267
 
3268
/// Converts a vector of [4 x float] into a vector of [4 x i32],
3269
///    truncating the result when it is inexact.
3270
///
3271
/// \headerfile <x86intrin.h>
3272
///
3273
/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3274
///   instruction.
3275
///
3276
/// \param __a
3277
///    A 128-bit vector of [4 x float].
3278
/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3279
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3280
  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3281
}
3282
 
3283
/// Returns a vector of [4 x i32] where the lowest element is the input
3284
///    operand and the remaining elements are zero.
3285
///
3286
/// \headerfile <x86intrin.h>
3287
///
3288
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3289
///
3290
/// \param __a
3291
///    A 32-bit signed integer operand.
3292
/// \returns A 128-bit vector of [4 x i32].
3293
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3294
  return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3295
}
3296
 
3297
/// Returns a vector of [2 x i64] where the lower element is the input
3298
///    operand and the upper element is zero.
3299
///
3300
/// \headerfile <x86intrin.h>
3301
///
3302
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3303
/// in 64-bit mode.
3304
///
3305
/// \param __a
3306
///    A 64-bit signed integer operand containing the value to be converted.
3307
/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3308
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3309
  return __extension__(__m128i)(__v2di){__a, 0};
3310
}
3311
 
3312
/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3313
///    32-bit signed integer value.
3314
///
3315
/// \headerfile <x86intrin.h>
3316
///
3317
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3318
///
3319
/// \param __a
3320
///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3321
///    destination.
3322
/// \returns A 32-bit signed integer containing the moved value.
3323
static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3324
  __v4si __b = (__v4si)__a;
3325
  return __b[0];
3326
}
3327
 
3328
/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3329
///    64-bit signed integer value.
3330
///
3331
/// \headerfile <x86intrin.h>
3332
///
3333
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3334
///
3335
/// \param __a
3336
///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3337
///    destination.
3338
/// \returns A 64-bit signed integer containing the moved value.
3339
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3340
  return __a[0];
3341
}
3342
 
3343
/// Moves packed integer values from an aligned 128-bit memory location
3344
///    to elements in a 128-bit integer vector.
3345
///
3346
/// \headerfile <x86intrin.h>
3347
///
3348
/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3349
///
3350
/// \param __p
3351
///    An aligned pointer to a memory location containing integer values.
3352
/// \returns A 128-bit integer vector containing the moved values.
3353
static __inline__ __m128i __DEFAULT_FN_ATTRS
3354
_mm_load_si128(__m128i const *__p) {
3355
  return *__p;
3356
}
3357
 
3358
/// Moves packed integer values from an unaligned 128-bit memory location
3359
///    to elements in a 128-bit integer vector.
3360
///
3361
/// \headerfile <x86intrin.h>
3362
///
3363
/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3364
///
3365
/// \param __p
3366
///    A pointer to a memory location containing integer values.
3367
/// \returns A 128-bit integer vector containing the moved values.
3368
static __inline__ __m128i __DEFAULT_FN_ATTRS
3369
_mm_loadu_si128(__m128i_u const *__p) {
3370
  struct __loadu_si128 {
3371
    __m128i_u __v;
3372
  } __attribute__((__packed__, __may_alias__));
3373
  return ((const struct __loadu_si128 *)__p)->__v;
3374
}
3375
 
3376
/// Returns a vector of [2 x i64] where the lower element is taken from
3377
///    the lower element of the operand, and the upper element is zero.
3378
///
3379
/// \headerfile <x86intrin.h>
3380
///
3381
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3382
///
3383
/// \param __p
3384
///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
3385
///    the destination.
3386
/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3387
///    moved value. The higher order bits are cleared.
3388
static __inline__ __m128i __DEFAULT_FN_ATTRS
3389
_mm_loadl_epi64(__m128i_u const *__p) {
3390
  struct __mm_loadl_epi64_struct {
3391
    long long __u;
3392
  } __attribute__((__packed__, __may_alias__));
3393
  return __extension__(__m128i){
3394
      ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3395
}
3396
 
3397
/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3398
///    This could be used as an argument to another intrinsic function where the
3399
///    argument is required but the value is not actually used.
3400
///
3401
/// \headerfile <x86intrin.h>
3402
///
3403
/// This intrinsic has no corresponding instruction.
3404
///
3405
/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3406
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3407
  return (__m128i)__builtin_ia32_undef128();
3408
}
3409
 
3410
/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3411
///    the specified 64-bit integer values.
3412
///
3413
/// \headerfile <x86intrin.h>
3414
///
3415
/// This intrinsic is a utility function and does not correspond to a specific
3416
///    instruction.
3417
///
3418
/// \param __q1
3419
///    A 64-bit integer value used to initialize the upper 64 bits of the
3420
///    destination vector of [2 x i64].
3421
/// \param __q0
3422
///    A 64-bit integer value used to initialize the lower 64 bits of the
3423
///    destination vector of [2 x i64].
3424
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3425
///    provided in the operands.
3426
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3427
                                                            long long __q0) {
3428
  return __extension__(__m128i)(__v2di){__q0, __q1};
3429
}
3430
 
3431
/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3432
///    the specified 64-bit integer values.
3433
///
3434
/// \headerfile <x86intrin.h>
3435
///
3436
/// This intrinsic is a utility function and does not correspond to a specific
3437
///    instruction.
3438
///
3439
/// \param __q1
3440
///    A 64-bit integer value used to initialize the upper 64 bits of the
3441
///    destination vector of [2 x i64].
3442
/// \param __q0
3443
///    A 64-bit integer value used to initialize the lower 64 bits of the
3444
///    destination vector of [2 x i64].
3445
/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3446
///    provided in the operands.
3447
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3448
                                                           __m64 __q0) {
3449
  return _mm_set_epi64x((long long)__q1, (long long)__q0);
3450
}
3451
 
3452
/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3453
///    the specified 32-bit integer values.
3454
///
3455
/// \headerfile <x86intrin.h>
3456
///
3457
/// This intrinsic is a utility function and does not correspond to a specific
3458
///    instruction.
3459
///
3460
/// \param __i3
3461
///    A 32-bit integer value used to initialize bits [127:96] of the
3462
///    destination vector.
3463
/// \param __i2
3464
///    A 32-bit integer value used to initialize bits [95:64] of the destination
3465
///    vector.
3466
/// \param __i1
3467
///    A 32-bit integer value used to initialize bits [63:32] of the destination
3468
///    vector.
3469
/// \param __i0
3470
///    A 32-bit integer value used to initialize bits [31:0] of the destination
3471
///    vector.
3472
/// \returns An initialized 128-bit vector of [4 x i32] containing the values
3473
///    provided in the operands.
3474
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) {
  /* The initializer runs from element 0 upwards, i.e. __i0 first. */
  __v4si __r = {__i0, __i1, __i2, __i3};
  return (__m128i)__r;
}
3478
 
3479
/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3480
///    the specified 16-bit integer values.
3481
///
3482
/// \headerfile <x86intrin.h>
3483
///
3484
/// This intrinsic is a utility function and does not correspond to a specific
3485
///    instruction.
3486
///
3487
/// \param __w7
3488
///    A 16-bit integer value used to initialize bits [127:112] of the
3489
///    destination vector.
3490
/// \param __w6
3491
///    A 16-bit integer value used to initialize bits [111:96] of the
3492
///    destination vector.
3493
/// \param __w5
3494
///    A 16-bit integer value used to initialize bits [95:80] of the destination
3495
///    vector.
3496
/// \param __w4
3497
///    A 16-bit integer value used to initialize bits [79:64] of the destination
3498
///    vector.
3499
/// \param __w3
3500
///    A 16-bit integer value used to initialize bits [63:48] of the destination
3501
///    vector.
3502
/// \param __w2
3503
///    A 16-bit integer value used to initialize bits [47:32] of the destination
3504
///    vector.
3505
/// \param __w1
3506
///    A 16-bit integer value used to initialize bits [31:16] of the destination
3507
///    vector.
3508
/// \param __w0
3509
///    A 16-bit integer value used to initialize bits [15:0] of the destination
3510
///    vector.
3511
/// \returns An initialized 128-bit vector of [8 x i16] containing the values
3512
///    provided in the operands.
3513
static __inline__ __m128i __DEFAULT_FN_ATTRS
3514
_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3515
              short __w2, short __w1, short __w0) {
3516
  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3517
                                        __w4, __w5, __w6, __w7};
3518
}
3519
 
3520
/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3521
///    the specified 8-bit integer values.
3522
///
3523
/// \headerfile <x86intrin.h>
3524
///
3525
/// This intrinsic is a utility function and does not correspond to a specific
3526
///    instruction.
3527
///
3528
/// \param __b15
3529
///    Initializes bits [127:120] of the destination vector.
3530
/// \param __b14
3531
///    Initializes bits [119:112] of the destination vector.
3532
/// \param __b13
3533
///    Initializes bits [111:104] of the destination vector.
3534
/// \param __b12
3535
///    Initializes bits [103:96] of the destination vector.
3536
/// \param __b11
3537
///    Initializes bits [95:88] of the destination vector.
3538
/// \param __b10
3539
///    Initializes bits [87:80] of the destination vector.
3540
/// \param __b9
3541
///    Initializes bits [79:72] of the destination vector.
3542
/// \param __b8
3543
///    Initializes bits [71:64] of the destination vector.
3544
/// \param __b7
3545
///    Initializes bits [63:56] of the destination vector.
3546
/// \param __b6
3547
///    Initializes bits [55:48] of the destination vector.
3548
/// \param __b5
3549
///    Initializes bits [47:40] of the destination vector.
3550
/// \param __b4
3551
///    Initializes bits [39:32] of the destination vector.
3552
/// \param __b3
3553
///    Initializes bits [31:24] of the destination vector.
3554
/// \param __b2
3555
///    Initializes bits [23:16] of the destination vector.
3556
/// \param __b1
3557
///    Initializes bits [15:8] of the destination vector.
3558
/// \param __b0
3559
///    Initializes bits [7:0] of the destination vector.
3560
/// \returns An initialized 128-bit vector of [16 x i8] containing the values
3561
///    provided in the operands.
3562
static __inline__ __m128i __DEFAULT_FN_ATTRS
3563
_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3564
             char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3565
             char __b4, char __b3, char __b2, char __b1, char __b0) {
3566
  return __extension__(__m128i)(__v16qi){
3567
      __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
3568
      __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3569
}
3570
 
3571
/// Initializes both values in a 128-bit integer vector with the
3572
///    specified 64-bit integer value.
3573
///
3574
/// \headerfile <x86intrin.h>
3575
///
3576
/// This intrinsic is a utility function and does not correspond to a specific
3577
///    instruction.
3578
///
3579
/// \param __q
3580
///    Integer value used to initialize the elements of the destination integer
3581
///    vector.
3582
/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3583
///    elements containing the value provided in the operand.
3584
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64x(long long __q) {
  /* Both 64-bit elements receive the same value. */
  __v2di __r = {__q, __q};
  return (__m128i)__r;
}
3587
 
3588
/// Initializes both values in a 128-bit vector of [2 x i64] with the
3589
///    specified 64-bit value.
3590
///
3591
/// \headerfile <x86intrin.h>
3592
///
3593
/// This intrinsic is a utility function and does not correspond to a specific
3594
///    instruction.
3595
///
3596
/// \param __q
3597
///    A 64-bit value used to initialize the elements of the destination integer
3598
///    vector.
3599
/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3600
///    containing the value provided in the operand.
3601
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64(__m64 __q) {
  /* Cast the __m64 operand once and use it for both 64-bit elements. */
  const long long __v = (long long)__q;
  return _mm_set_epi64x(__v, __v);
}
3604
 
3605
/// Initializes all values in a 128-bit vector of [4 x i32] with the
3606
///    specified 32-bit value.
3607
///
3608
/// \headerfile <x86intrin.h>
3609
///
3610
/// This intrinsic is a utility function and does not correspond to a specific
3611
///    instruction.
3612
///
3613
/// \param __i
3614
///    A 32-bit value used to initialize the elements of the destination integer
3615
///    vector.
3616
/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3617
///    containing the value provided in the operand.
3618
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi32(int __i) {
  /* All four 32-bit elements receive the same value. */
  __v4si __r = {__i, __i, __i, __i};
  return (__m128i)__r;
}
3621
 
3622
/// Initializes all values in a 128-bit vector of [8 x i16] with the
3623
///    specified 16-bit value.
3624
///
3625
/// \headerfile <x86intrin.h>
3626
///
3627
/// This intrinsic is a utility function and does not correspond to a specific
3628
///    instruction.
3629
///
3630
/// \param __w
3631
///    A 16-bit value used to initialize the elements of the destination integer
3632
///    vector.
3633
/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3634
///    containing the value provided in the operand.
3635
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi16(short __w) {
  /* All eight 16-bit elements receive the same value. */
  __v8hi __r = {__w, __w, __w, __w, __w, __w, __w, __w};
  return (__m128i)__r;
}
3638
 
3639
/// Initializes all values in a 128-bit vector of [16 x i8] with the
3640
///    specified 8-bit value.
3641
///
3642
/// \headerfile <x86intrin.h>
3643
///
3644
/// This intrinsic is a utility function and does not correspond to a specific
3645
///    instruction.
3646
///
3647
/// \param __b
3648
///    An 8-bit value used to initialize the elements of the destination integer
3649
///    vector.
3650
/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3651
///    containing the value provided in the operand.
3652
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b) {
  /* All sixteen 8-bit elements receive the same value. */
  __v16qi __r = {__b, __b, __b, __b, __b, __b, __b, __b,
                 __b, __b, __b, __b, __b, __b, __b, __b};
  return (__m128i)__r;
}
3656
 
3657
/// Constructs a 128-bit integer vector, initialized in reverse order
3658
///     with the specified 64-bit integral values.
3659
///
3660
/// \headerfile <x86intrin.h>
3661
///
3662
/// This intrinsic does not correspond to a specific instruction.
3663
///
3664
/// \param __q0
3665
///    A 64-bit integral value used to initialize the lower 64 bits of the
3666
///    result.
3667
/// \param __q1
3668
///    A 64-bit integral value used to initialize the upper 64 bits of the
3669
///    result.
3670
/// \returns An initialized 128-bit integer vector.
3671
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0, __m64 __q1) {
  /* _mm_set_epi64x expects the upper value first, so the two operands are
     swapped on the way in. */
  const long long __lo = (long long)__q0;
  const long long __hi = (long long)__q1;
  return _mm_set_epi64x(__hi, __lo);
}
3675
 
3676
/// Constructs a 128-bit integer vector, initialized in reverse order
3677
///     with the specified 32-bit integral values.
3678
///
3679
/// \headerfile <x86intrin.h>
3680
///
3681
/// This intrinsic is a utility function and does not correspond to a specific
3682
///    instruction.
3683
///
3684
/// \param __i0
3685
///    A 32-bit integral value used to initialize bits [31:0] of the result.
3686
/// \param __i1
3687
///    A 32-bit integral value used to initialize bits [63:32] of the result.
3688
/// \param __i2
3689
///    A 32-bit integral value used to initialize bits [95:64] of the result.
3690
/// \param __i3
3691
///    A 32-bit integral value used to initialize bits [127:96] of the result.
3692
/// \returns An initialized 128-bit integer vector.
3693
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) {
  /* Parameters already arrive in element order, so they map one-to-one onto
     the initializer. */
  __v4si __r = {__i0, __i1, __i2, __i3};
  return (__m128i)__r;
}
3698
 
3699
/// Constructs a 128-bit integer vector, initialized in reverse order
3700
///     with the specified 16-bit integral values.
3701
///
3702
/// \headerfile <x86intrin.h>
3703
///
3704
/// This intrinsic is a utility function and does not correspond to a specific
3705
///    instruction.
3706
///
3707
/// \param __w0
3708
///    A 16-bit integral value used to initialize bits [15:0] of the result.
3709
/// \param __w1
3710
///    A 16-bit integral value used to initialize bits [31:16] of the result.
3711
/// \param __w2
3712
///    A 16-bit integral value used to initialize bits [47:32] of the result.
3713
/// \param __w3
3714
///    A 16-bit integral value used to initialize bits [63:48] of the result.
3715
/// \param __w4
3716
///    A 16-bit integral value used to initialize bits [79:64] of the result.
3717
/// \param __w5
3718
///    A 16-bit integral value used to initialize bits [95:80] of the result.
3719
/// \param __w6
3720
///    A 16-bit integral value used to initialize bits [111:96] of the result.
3721
/// \param __w7
3722
///    A 16-bit integral value used to initialize bits [127:112] of the result.
3723
/// \returns An initialized 128-bit integer vector.
3724
static __inline__ __m128i __DEFAULT_FN_ATTRS
3725
_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3726
               short __w5, short __w6, short __w7) {
3727
  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3728
}
3729
 
3730
/// Constructs a 128-bit integer vector, initialized in reverse order
3731
///     with the specified 8-bit integral values.
3732
///
3733
/// \headerfile <x86intrin.h>
3734
///
3735
/// This intrinsic is a utility function and does not correspond to a specific
3736
///    instruction.
3737
///
3738
/// \param __b0
3739
///    An 8-bit integral value used to initialize bits [7:0] of the result.
3740
/// \param __b1
3741
///    An 8-bit integral value used to initialize bits [15:8] of the result.
3742
/// \param __b2
3743
///    An 8-bit integral value used to initialize bits [23:16] of the result.
3744
/// \param __b3
3745
///    An 8-bit integral value used to initialize bits [31:24] of the result.
3746
/// \param __b4
3747
///    An 8-bit integral value used to initialize bits [39:32] of the result.
3748
/// \param __b5
3749
///    An 8-bit integral value used to initialize bits [47:40] of the result.
3750
/// \param __b6
3751
///    An 8-bit integral value used to initialize bits [55:48] of the result.
3752
/// \param __b7
3753
///    An 8-bit integral value used to initialize bits [63:56] of the result.
3754
/// \param __b8
3755
///    An 8-bit integral value used to initialize bits [71:64] of the result.
3756
/// \param __b9
3757
///    An 8-bit integral value used to initialize bits [79:72] of the result.
3758
/// \param __b10
3759
///    An 8-bit integral value used to initialize bits [87:80] of the result.
3760
/// \param __b11
3761
///    An 8-bit integral value used to initialize bits [95:88] of the result.
3762
/// \param __b12
3763
///    An 8-bit integral value used to initialize bits [103:96] of the result.
3764
/// \param __b13
3765
///    An 8-bit integral value used to initialize bits [111:104] of the result.
3766
/// \param __b14
3767
///    An 8-bit integral value used to initialize bits [119:112] of the result.
3768
/// \param __b15
3769
///    An 8-bit integral value used to initialize bits [127:120] of the result.
3770
/// \returns An initialized 128-bit integer vector.
3771
static __inline__ __m128i __DEFAULT_FN_ATTRS
3772
_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3773
              char __b6, char __b7, char __b8, char __b9, char __b10,
3774
              char __b11, char __b12, char __b13, char __b14, char __b15) {
3775
  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3776
                      __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3777
}
3778
 
3779
/// Creates a 128-bit integer vector initialized to zero.
3780
///
3781
/// \headerfile <x86intrin.h>
3782
///
3783
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3784
///
3785
/// \returns An initialized 128-bit integer vector with all elements set to
3786
///    zero.
3787
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void) {
  /* Two zeroed 64-bit elements cover all 128 bits. */
  __v2di __zero = {0LL, 0LL};
  return (__m128i)__zero;
}
3790
 
3791
/// Stores a 128-bit integer vector to a memory location aligned on a
3792
///    128-bit boundary.
3793
///
3794
/// \headerfile <x86intrin.h>
3795
///
3796
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3797
///
3798
/// \param __p
3799
///    A pointer to an aligned memory location that will receive the integer
3800
///    values.
3801
/// \param __b
3802
///    A 128-bit integer vector containing the values to be moved.
3803
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i *__p, __m128i __b) {
  /* A plain assignment through the __m128i pointer performs the store. */
  __p[0] = __b;
}
3807
 
3808
/// Stores a 128-bit integer vector to an unaligned memory location.
3809
///
3810
/// \headerfile <x86intrin.h>
3811
///
3812
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3813
///
3814
/// \param __p
3815
///    A pointer to a memory location that will receive the integer values.
3816
/// \param __b
3817
///    A 128-bit integer vector containing the values to be moved.
3818
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i_u *__p, __m128i __b) {
  /* The store goes through a packed, may_alias wrapper struct rather than
     through __p directly. */
  struct __storeu_si128 {
    __m128i_u __v;
  } __attribute__((__packed__, __may_alias__));
  struct __storeu_si128 *__dst = (struct __storeu_si128 *)__p;
  __dst->__v = __b;
}
3825
 
3826
/// Stores a 64-bit integer value from the low element of a 128-bit integer
3827
///    vector.
3828
///
3829
/// \headerfile <x86intrin.h>
3830
///
3831
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3832
///
3833
/// \param __p
3834
///    A pointer to a 64-bit memory location. The address of the memory
3835
///    location does not have to be aligned.
3836
/// \param __b
3837
///    A 128-bit integer vector containing the value to be stored.
3838
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si64(void *__p, __m128i __b) {
  /* The store goes through a packed, may_alias wrapper struct holding a
     single long long. */
  struct __storeu_si64 {
    long long __v;
  } __attribute__((__packed__, __may_alias__));
  struct __storeu_si64 *__dst = (struct __storeu_si64 *)__p;
  /* Element 0 of the [2 x i64] view is the value written out. */
  const long long __lo = ((__v2di)__b)[0];
  __dst->__v = __lo;
}
3845
 
3846
/// Stores a 32-bit integer value from the low element of a 128-bit integer
3847
///    vector.
3848
///
3849
/// \headerfile <x86intrin.h>
3850
///
3851
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3852
///
3853
/// \param __p
3854
///    A pointer to a 32-bit memory location. The address of the memory
3855
///    location does not have to be aligned.
3856
/// \param __b
3857
///    A 128-bit integer vector containing the value to be stored.
3858
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si32(void *__p, __m128i __b) {
  /* The store goes through a packed, may_alias wrapper struct holding a
     single int. */
  struct __storeu_si32 {
    int __v;
  } __attribute__((__packed__, __may_alias__));
  struct __storeu_si32 *__dst = (struct __storeu_si32 *)__p;
  /* Element 0 of the [4 x i32] view is the value written out. */
  const int __lo = ((__v4si)__b)[0];
  __dst->__v = __lo;
}
3865
 
3866
/// Stores a 16-bit integer value from the low element of a 128-bit integer
3867
///    vector.
3868
///
3869
/// \headerfile <x86intrin.h>
3870
///
3871
/// This intrinsic does not correspond to a specific instruction.
3872
///
3873
/// \param __p
3874
///    A pointer to a 16-bit memory location. The address of the memory
3875
///    location does not have to be aligned.
3876
/// \param __b
3877
///    A 128-bit integer vector containing the value to be stored.
3878
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si16(void *__p, __m128i __b) {
  /* The store goes through a packed, may_alias wrapper struct holding a
     single short. */
  struct __storeu_si16 {
    short __v;
  } __attribute__((__packed__, __may_alias__));
  struct __storeu_si16 *__dst = (struct __storeu_si16 *)__p;
  /* Element 0 of the [8 x i16] view is the value written out. */
  const short __lo = ((__v8hi)__b)[0];
  __dst->__v = __lo;
}
3885
 
3886
/// Moves bytes selected by the mask from the first operand to the
3887
///    specified unaligned memory location. When a mask bit is 1, the
3888
///    corresponding byte is written, otherwise it is not written.
3889
///
3890
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3891
///    used again soon). Exception and trap behavior for elements not selected
3892
///    for storage to memory are implementation dependent.
3893
///
3894
/// \headerfile <x86intrin.h>
3895
///
3896
/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3897
///   instruction.
3898
///
3899
/// \param __d
3900
///    A 128-bit integer vector containing the values to be moved.
3901
/// \param __n
3902
///    A 128-bit integer vector containing the mask. The most significant bit of
3903
///    each byte represents the mask bits.
3904
/// \param __p
3905
///    A pointer to an unaligned 128-bit memory location where the specified
3906
///    values are moved.
3907
static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) {
  /* The builtin takes both vectors as [16 x i8]: data first, mask second,
     destination pointer last. */
  const __v16qi __src = (__v16qi)__d;
  const __v16qi __sel = (__v16qi)__n;
  __builtin_ia32_maskmovdqu(__src, __sel, __p);
}
3912
 
3913
/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3914
///    a memory location.
3915
///
3916
/// \headerfile <x86intrin.h>
3917
///
3918
/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3919
///
3920
/// \param __p
3921
///    A pointer to a 64-bit memory location that will receive the lower 64 bits
3922
///    of the integer vector parameter.
3923
/// \param __a
3924
///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3925
///    value to be stored.
3926
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i_u *__p, __m128i __a) {
  /* The store goes through a packed, may_alias wrapper struct holding a
     single long long. */
  struct __mm_storel_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  struct __mm_storel_epi64_struct *__dst =
      (struct __mm_storel_epi64_struct *)__p;
  /* Only element 0 of __a is written out. */
  const long long __lo = __a[0];
  __dst->__u = __lo;
}
3933
 
3934
/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3935
///    aligned memory location.
3936
///
3937
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3938
///    used again soon).
3939
///
3940
/// \headerfile <x86intrin.h>
3941
///
3942
/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3943
///
3944
/// \param __p
3945
///    A pointer to the 128-bit aligned memory location used to store the value.
3946
/// \param __a
3947
///    A vector of [2 x double] containing the 64-bit values to be stored.
3948
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a) {
  /* View the destination as a [2 x double] vector and hand the store to the
     nontemporal-store builtin. */
  __v2df *__dst = (__v2df *)__p;
  __builtin_nontemporal_store((__v2df)__a, __dst);
}
3952
 
3953
/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3954
///
3955
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3956
///    used again soon).
3957
///
3958
/// \headerfile <x86intrin.h>
3959
///
3960
/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3961
///
3962
/// \param __p
3963
///    A pointer to the 128-bit aligned memory location used to store the value.
3964
/// \param __a
3965
///    A 128-bit integer vector containing the values to be stored.
3966
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a) {
  /* View the destination as a [2 x i64] vector and hand the store to the
     nontemporal-store builtin. */
  __v2di *__dst = (__v2di *)__p;
  __builtin_nontemporal_store((__v2di)__a, __dst);
}
3970
 
3971
/// Stores a 32-bit integer value in the specified memory location.
3972
///
3973
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3974
///    used again soon).
3975
///
3976
/// \headerfile <x86intrin.h>
3977
///
3978
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
3979
///
3980
/// \param __p
3981
///    A pointer to the 32-bit memory location used to store the value.
3982
/// \param __a
3983
///    A 32-bit integer containing the value to be stored.
3984
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si32(int *__p, int __a) {
  /* Destination and value go straight to the MOVNTI builtin. The attribute
     list is spelled out here and names the "sse2" target explicitly. */
  __builtin_ia32_movnti(__p, __a);
}
3989
 
3990
#ifdef __x86_64__
3991
/// Stores a 64-bit integer value in the specified memory location.
3992
///
3993
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3994
///    used again soon).
3995
///
3996
/// \headerfile <x86intrin.h>
3997
///
3998
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
3999
///
4000
/// \param __p
4001
///    A pointer to the 64-bit memory location used to store the value.
4002
/// \param __a
4003
///    A 64-bit integer containing the value to be stored.
4004
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si64(long long *__p, long long __a) {
  /* Destination and value go straight to the 64-bit MOVNTI builtin. The
     attribute list is spelled out here and names the "sse2" target
     explicitly. */
  __builtin_ia32_movnti64(__p, __a);
}
4009
#endif
4010
 
4011
#if defined(__cplusplus)
4012
extern "C" {
4013
#endif
4014
 
4015
/// The cache line containing \a __p is flushed and invalidated from all
4016
///    caches in the coherency domain.
4017
///
4018
/// \headerfile <x86intrin.h>
4019
///
4020
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4021
///
4022
/// \param __p
4023
///    A pointer to the memory location used to identify the cache line to be
4024
///    flushed.
4025
void _mm_clflush(void const *__p); /* prototype only, no inline body */
4026
 
4027
/// Forces strong memory ordering (serialization) between load
4028
///    instructions preceding this instruction and load instructions following
4029
///    this instruction, ensuring the system completes all previous loads before
4030
///    executing subsequent loads.
4031
///
4032
/// \headerfile <x86intrin.h>
4033
///
4034
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4035
///
4036
void _mm_lfence(void); /* prototype only, no inline body */
4037
 
4038
/// Forces strong memory ordering (serialization) between load and store
4039
///    instructions preceding this instruction and load and store instructions
4040
///    following this instruction, ensuring that the system completes all
4041
///    previous memory accesses before executing subsequent memory accesses.
4042
///
4043
/// \headerfile <x86intrin.h>
4044
///
4045
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4046
///
4047
void _mm_mfence(void); /* prototype only, no inline body */
4048
 
4049
#if defined(__cplusplus)
4050
} // extern "C"
4051
#endif
4052
 
4053
/// Converts 16-bit signed integers from both 128-bit integer vector
4054
///    operands into 8-bit signed integers, and packs the results into the
4055
///    destination. Positive values greater than 0x7F are saturated to 0x7F.
4056
///    Negative values less than 0x80 are saturated to 0x80.
4057
///
4058
/// \headerfile <x86intrin.h>
4059
///
4060
/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4061
///
4062
/// \param __a
4063
///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4064
///   a signed integer and is converted to an 8-bit signed integer with
4065
///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4066
///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4067
///   written to the lower 64 bits of the result.
4068
/// \param __b
4069
///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4070
///   a signed integer and is converted to an 8-bit signed integer with
4071
///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4072
///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4073
///   written to the higher 64 bits of the result.
4074
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4075
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a, __m128i __b) {
  /* Both operands are viewed as [8 x i16] before being handed to the
     PACKSSWB builtin; its result is cast back to __m128i. */
  const __v8hi __first = (__v8hi)__a;
  const __v8hi __second = (__v8hi)__b;
  return (__m128i)__builtin_ia32_packsswb128(__first, __second);
}
4079
 
4080
/// Converts 32-bit signed integers from both 128-bit integer vector
4081
///    operands into 16-bit signed integers, and packs the results into the
4082
///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4083
///    Negative values less than 0x8000 are saturated to 0x8000.
4084
///
4085
/// \headerfile <x86intrin.h>
4086
///
4087
/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4088
///
4089
/// \param __a
4090
///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4091
///    a signed integer and is converted to a 16-bit signed integer with
4092
///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4093
///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4094
///    are written to the lower 64 bits of the result.
4095
/// \param __b
4096
///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4097
///    a signed integer and is converted to a 16-bit signed integer with
4098
///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4099
///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4100
///    are written to the higher 64 bits of the result.
4101
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4102
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a, __m128i __b) {
  /* Both operands are viewed as [4 x i32] before being handed to the
     PACKSSDW builtin; its result is cast back to __m128i. */
  const __v4si __first = (__v4si)__a;
  const __v4si __second = (__v4si)__b;
  return (__m128i)__builtin_ia32_packssdw128(__first, __second);
}
4106
 
4107
/// Converts 16-bit signed integers from both 128-bit integer vector
4108
///    operands into 8-bit unsigned integers, and packs the results into the
4109
///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
4110
///    than 0x00 are saturated to 0x00.
4111
///
4112
/// \headerfile <x86intrin.h>
4113
///
4114
/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4115
///
4116
/// \param __a
4117
///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4118
///    a signed integer and is converted to an 8-bit unsigned integer with
4119
///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4120
///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4121
///    written to the lower 64 bits of the result.
4122
/// \param __b
4123
///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4124
///    a signed integer and is converted to an 8-bit unsigned integer with
4125
///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4126
///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4127
///    written to the higher 64 bits of the result.
4128
/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4129
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a, __m128i __b) {
  /* Both operands are viewed as [8 x i16] before being handed to the
     PACKUSWB builtin; its result is cast back to __m128i. */
  const __v8hi __first = (__v8hi)__a;
  const __v8hi __second = (__v8hi)__b;
  return (__m128i)__builtin_ia32_packuswb128(__first, __second);
}
4133
 
4134
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4135
///    the immediate-value parameter as a selector.
4136
///
4137
/// \headerfile <x86intrin.h>
4138
///
4139
/// \code
4140
/// int _mm_extract_epi16(__m128i a, const int imm);
4141
/// \endcode
4142
///
4143
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4144
///
4145
/// \param a
4146
///    A 128-bit integer vector.
4147
/// \param imm
4148
///    An immediate value. Bits [2:0] select the value from \a a to be assigned
///    to bits [15:0] of the result. \n
4150
///    000: assign values from bits [15:0] of \a a. \n
4151
///    001: assign values from bits [31:16] of \a a. \n
4152
///    010: assign values from bits [47:32] of \a a. \n
4153
///    011: assign values from bits [63:48] of \a a. \n
4154
///    100: assign values from bits [79:64] of \a a. \n
4155
///    101: assign values from bits [95:80] of \a a. \n
4156
///    110: assign values from bits [111:96] of \a a. \n
4157
///    111: assign values from bits [127:112] of \a a.
4158
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
4159
///    integer vector parameter and the remaining bits are assigned zeros.
4160
/* The (unsigned short) cast is applied to the extracted element before the
   final widening cast to int. */
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi(                           \
      (__v8hi)(__m128i)(a), (int)(imm)))
4163
 
4164
/// Constructs a 128-bit integer vector by first making a copy of the
4165
///    128-bit integer vector parameter, and then inserting the lower 16 bits
4166
///    of an integer parameter into an offset specified by the immediate-value
4167
///    parameter.
4168
///
4169
/// \headerfile <x86intrin.h>
4170
///
4171
/// \code
4172
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4173
/// \endcode
4174
///
4175
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4176
///
4177
/// \param a
4178
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
4179
///    result and then one of the eight elements in the result is replaced by
4180
///    the lower 16 bits of \a b.
4181
/// \param b
4182
///    An integer. The lower 16 bits of this parameter are written to the
4183
///    result beginning at an offset specified by \a imm.
4184
/// \param imm
4185
///    An immediate value. Bits [2:0] select which of the eight 16-bit
///    elements of the result is replaced by the lower 16 bits of \a b.
4187
/// \returns A 128-bit integer vector containing the constructed values.
4188
#define _mm_insert_epi16(a, b, imm) \
  /* a is viewed as [8 x i16]; b and imm are passed to the builtin as int. */ \
  ((__m128i)__builtin_ia32_vec_set_v8hi( \
      (__v8hi)(__m128i)(a), \
      (int)(b), \
      (int)(imm)))
4191
 
4192
/// Copies the values of the most significant bits from each 8-bit
4193
///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4194
///    value, zero-extends the value, and writes it to the destination.
4195
///
4196
/// \headerfile <x86intrin.h>
4197
///
4198
/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4199
///
4200
/// \param __a
4201
///    A 128-bit integer vector containing the values with bits to be extracted.
4202
/// \returns The most significant bits from each 8-bit element in \a __a,
4203
///    written to bits [15:0]. The other bits are assigned zeros.
4204
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
  /* The PMOVMSKB builtin takes its operand as [16 x i8]. */
  __v16qi __bytes = (__v16qi)__a;
  return __builtin_ia32_pmovmskb128(__bytes);
}
4207
 
4208
/// Constructs a 128-bit integer vector by shuffling four 32-bit
4209
///    elements of a 128-bit integer vector parameter, using the immediate-value
4210
///    parameter as a specifier.
4211
///
4212
/// \headerfile <x86intrin.h>
4213
///
4214
/// \code
4215
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4216
/// \endcode
4217
///
4218
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4219
///
4220
/// \param a
4221
///    A 128-bit integer vector containing the values to be copied.
4222
/// \param imm
4223
///    An immediate value containing an 8-bit value specifying which elements to
4224
///    copy from a. The destinations within the 128-bit destination are assigned
4225
///    values as follows: \n
4226
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4227
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4228
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4229
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4230
///    Bit value assignments: \n
4231
///    00: assign values from bits [31:0] of \a a. \n
4232
///    01: assign values from bits [63:32] of \a a. \n
4233
///    10: assign values from bits [95:64] of \a a. \n
4234
///    11: assign values from bits [127:96] of \a a. \n
4235
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4236
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4237
///    <c>[b6, b4, b2, b0]</c>.
4238
/// \returns A 128-bit integer vector containing the shuffled values.
4239
#define _mm_shuffle_epi32(a, imm) \
  /* a is viewed as [4 x i32]; imm is the immediate selector, passed as int. */ \
  ((__m128i)__builtin_ia32_pshufd( \
      (__v4si)(__m128i)(a), \
      (int)(imm)))
4241
 
4242
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4243
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
4244
///    value parameter as a specifier.
4245
///
4246
/// \headerfile <x86intrin.h>
4247
///
4248
/// \code
4249
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4250
/// \endcode
4251
///
4252
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4253
///
4254
/// \param a
4255
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4256
///    [127:64] of the result.
4257
/// \param imm
4258
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
4259
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4260
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4261
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4262
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4263
///    Bit value assignments: \n
4264
///    00: assign values from bits [15:0] of \a a. \n
4265
///    01: assign values from bits [31:16] of \a a. \n
4266
///    10: assign values from bits [47:32] of \a a. \n
4267
///    11: assign values from bits [63:48] of \a a. \n
4268
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4269
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4270
///    <c>[b6, b4, b2, b0]</c>.
4271
/// \returns A 128-bit integer vector containing the shuffled values.
4272
#define _mm_shufflelo_epi16(a, imm) \
  /* a is viewed as [8 x i16]; imm is the immediate selector, passed as int. */ \
  ((__m128i)__builtin_ia32_pshuflw( \
      (__v8hi)(__m128i)(a), \
      (int)(imm)))
4274
 
4275
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4276
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
4277
///    value parameter as a specifier.
4278
///
4279
/// \headerfile <x86intrin.h>
4280
///
4281
/// \code
4282
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4283
/// \endcode
4284
///
4285
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4286
///
4287
/// \param a
4288
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4289
///    [63:0] of the result.
4290
/// \param imm
4291
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
4292
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4293
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4294
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4295
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4296
///    Bit value assignments: \n
4297
///    00: assign values from bits [79:64] of \a a. \n
4298
///    01: assign values from bits [95:80] of \a a. \n
4299
///    10: assign values from bits [111:96] of \a a. \n
4300
///    11: assign values from bits [127:112] of \a a. \n
4301
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4302
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4303
///    <c>[b6, b4, b2, b0]</c>.
4304
/// \returns A 128-bit integer vector containing the shuffled values.
4305
#define _mm_shufflehi_epi16(a, imm) \
  /* a is viewed as [8 x i16]; imm is the immediate selector, passed as int. */ \
  ((__m128i)__builtin_ia32_pshufhw( \
      (__v8hi)(__m128i)(a), \
      (int)(imm)))
4307
 
4308
/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4309
///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4310
///
4311
/// \headerfile <x86intrin.h>
4312
///
4313
/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4314
///   instruction.
4315
///
4316
/// \param __a
4317
///    A 128-bit vector of [16 x i8].
4318
///    Bits [71:64] are written to bits [7:0] of the result. \n
4319
///    Bits [79:72] are written to bits [23:16] of the result. \n
4320
///    Bits [87:80] are written to bits [39:32] of the result. \n
4321
///    Bits [95:88] are written to bits [55:48] of the result. \n
4322
///    Bits [103:96] are written to bits [71:64] of the result. \n
4323
///    Bits [111:104] are written to bits [87:80] of the result. \n
4324
///    Bits [119:112] are written to bits [103:96] of the result. \n
4325
///    Bits [127:120] are written to bits [119:112] of the result.
4326
/// \param __b
4327
///    A 128-bit vector of [16 x i8]. \n
4328
///    Bits [71:64] are written to bits [15:8] of the result. \n
4329
///    Bits [79:72] are written to bits [31:24] of the result. \n
4330
///    Bits [87:80] are written to bits [47:40] of the result. \n
4331
///    Bits [95:88] are written to bits [63:56] of the result. \n
4332
///    Bits [103:96] are written to bits [79:72] of the result. \n
4333
///    Bits [111:104] are written to bits [95:88] of the result. \n
4334
///    Bits [119:112] are written to bits [111:104] of the result. \n
4335
///    Bits [127:120] are written to bits [127:120] of the result.
4336
/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4337
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
                                                               __m128i __b) {
  /* Shuffle indices 0-15 address bytes of the first operand and 16-31 bytes
     of the second, so every (k, 16 + k) pair emits byte k of __a followed by
     byte k of __b. */
  __v16qi __lhs = (__v16qi)__a;
  __v16qi __rhs = (__v16qi)__b;
  return (__m128i)__builtin_shufflevector(__lhs, __rhs,
                                          8, 16 + 8, 9, 16 + 9,
                                          10, 16 + 10, 11, 16 + 11,
                                          12, 16 + 12, 13, 16 + 13,
                                          14, 16 + 14, 15, 16 + 15);
}
4343
 
4344
/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4345
///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4346
///
4347
/// \headerfile <x86intrin.h>
4348
///
4349
/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4350
///   instruction.
4351
///
4352
/// \param __a
4353
///    A 128-bit vector of [8 x i16].
4354
///    Bits [79:64] are written to bits [15:0] of the result. \n
4355
///    Bits [95:80] are written to bits [47:32] of the result. \n
4356
///    Bits [111:96] are written to bits [79:64] of the result. \n
4357
///    Bits [127:112] are written to bits [111:96] of the result.
4358
/// \param __b
4359
///    A 128-bit vector of [8 x i16].
4360
///    Bits [79:64] are written to bits [31:16] of the result. \n
4361
///    Bits [95:80] are written to bits [63:48] of the result. \n
4362
///    Bits [111:96] are written to bits [95:80] of the result. \n
4363
///    Bits [127:112] are written to bits [127:112] of the result.
4364
/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4365
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
                                                                __m128i __b) {
  /* Indices 0-7 address elements of the first operand and 8-15 elements of
     the second; each (k, 8 + k) pair interleaves element k of both inputs. */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_shufflevector(__lhs, __rhs,
                                          4, 8 + 4, 5, 8 + 5,
                                          6, 8 + 6, 7, 8 + 7);
}
4370
 
4371
/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4372
///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4373
///
4374
/// \headerfile <x86intrin.h>
4375
///
4376
/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4377
///   instruction.
4378
///
4379
/// \param __a
4380
///    A 128-bit vector of [4 x i32]. \n
4381
///    Bits [95:64] are written to bits [31:0] of the destination. \n
4382
///    Bits [127:96] are written to bits [95:64] of the destination.
4383
/// \param __b
4384
///    A 128-bit vector of [4 x i32]. \n
4385
///    Bits [95:64] are written to bits [63:32] of the destination. \n
4386
///    Bits [127:96] are written to bits [127:96] of the destination.
4387
/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4388
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
                                                                __m128i __b) {
  /* Indices 0-3 address elements of the first operand and 4-7 elements of
     the second; each (k, 4 + k) pair interleaves element k of both inputs. */
  __v4si __lhs = (__v4si)__a;
  __v4si __rhs = (__v4si)__b;
  return (__m128i)__builtin_shufflevector(__lhs, __rhs,
                                          2, 4 + 2,
                                          3, 4 + 3);
}
4393
 
4394
/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4395
///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4396
///
4397
/// \headerfile <x86intrin.h>
4398
///
4399
/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4400
///   instruction.
4401
///
4402
/// \param __a
4403
///    A 128-bit vector of [2 x i64]. \n
4404
///    Bits [127:64] are written to bits [63:0] of the destination.
4405
/// \param __b
4406
///    A 128-bit vector of [2 x i64]. \n
4407
///    Bits [127:64] are written to bits [127:64] of the destination.
4408
/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4409
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
                                                                __m128i __b) {
  /* Index 1 is the upper element of __a; index 2 + 1 is the upper element of
     __b (indices 2-3 address the second operand). */
  __v2di __lhs = (__v2di)__a;
  __v2di __rhs = (__v2di)__b;
  return (__m128i)__builtin_shufflevector(__lhs, __rhs, 1, 2 + 1);
}
4413
 
4414
/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4415
///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4416
///
4417
/// \headerfile <x86intrin.h>
4418
///
4419
/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4420
///   instruction.
4421
///
4422
/// \param __a
4423
///    A 128-bit vector of [16 x i8]. \n
4424
///    Bits [7:0] are written to bits [7:0] of the result. \n
4425
///    Bits [15:8] are written to bits [23:16] of the result. \n
4426
///    Bits [23:16] are written to bits [39:32] of the result. \n
4427
///    Bits [31:24] are written to bits [55:48] of the result. \n
4428
///    Bits [39:32] are written to bits [71:64] of the result. \n
4429
///    Bits [47:40] are written to bits [87:80] of the result. \n
4430
///    Bits [55:48] are written to bits [103:96] of the result. \n
4431
///    Bits [63:56] are written to bits [119:112] of the result.
4432
/// \param __b
4433
///    A 128-bit vector of [16 x i8].
4434
///    Bits [7:0] are written to bits [15:8] of the result. \n
4435
///    Bits [15:8] are written to bits [31:24] of the result. \n
4436
///    Bits [23:16] are written to bits [47:40] of the result. \n
4437
///    Bits [31:24] are written to bits [63:56] of the result. \n
4438
///    Bits [39:32] are written to bits [79:72] of the result. \n
4439
///    Bits [47:40] are written to bits [95:88] of the result. \n
4440
///    Bits [55:48] are written to bits [111:104] of the result. \n
4441
///    Bits [63:56] are written to bits [127:120] of the result.
4442
/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4443
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
                                                               __m128i __b) {
  /* Shuffle indices 0-15 address bytes of the first operand and 16-31 bytes
     of the second, so every (k, 16 + k) pair emits byte k of __a followed by
     byte k of __b. */
  __v16qi __lhs = (__v16qi)__a;
  __v16qi __rhs = (__v16qi)__b;
  return (__m128i)__builtin_shufflevector(__lhs, __rhs,
                                          0, 16 + 0, 1, 16 + 1,
                                          2, 16 + 2, 3, 16 + 3,
                                          4, 16 + 4, 5, 16 + 5,
                                          6, 16 + 6, 7, 16 + 7);
}
4449
 
4450
/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4451
///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4452
///    [8 x i16].
4453
///
4454
/// \headerfile <x86intrin.h>
4455
///
4456
/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4457
///   instruction.
4458
///
4459
/// \param __a
4460
///    A 128-bit vector of [8 x i16].
4461
///    Bits [15:0] are written to bits [15:0] of the result. \n
4462
///    Bits [31:16] are written to bits [47:32] of the result. \n
4463
///    Bits [47:32] are written to bits [79:64] of the result. \n
4464
///    Bits [63:48] are written to bits [111:96] of the result.
4465
/// \param __b
4466
///    A 128-bit vector of [8 x i16].
4467
///    Bits [15:0] are written to bits [31:16] of the result. \n
4468
///    Bits [31:16] are written to bits [63:48] of the result. \n
4469
///    Bits [47:32] are written to bits [95:80] of the result. \n
4470
///    Bits [63:48] are written to bits [127:112] of the result.
4471
/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4472
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
                                                                __m128i __b) {
  /* Indices 0-7 address elements of the first operand and 8-15 elements of
     the second; each (k, 8 + k) pair interleaves element k of both inputs. */
  __v8hi __lhs = (__v8hi)__a;
  __v8hi __rhs = (__v8hi)__b;
  return (__m128i)__builtin_shufflevector(__lhs, __rhs,
                                          0, 8 + 0, 1, 8 + 1,
                                          2, 8 + 2, 3, 8 + 3);
}
4477
 
4478
/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4479
///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4480
///
4481
/// \headerfile <x86intrin.h>
4482
///
4483
/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4484
///   instruction.
4485
///
4486
/// \param __a
4487
///    A 128-bit vector of [4 x i32]. \n
4488
///    Bits [31:0] are written to bits [31:0] of the destination. \n
4489
///    Bits [63:32] are written to bits [95:64] of the destination.
4490
/// \param __b
4491
///    A 128-bit vector of [4 x i32]. \n
4492
///    Bits [31:0] are written to bits [63:32] of the destination. \n
4493
///    Bits [63:32] are written to bits [127:96] of the destination.
4494
/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4495
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
                                                                __m128i __b) {
  /* Indices 0-3 address elements of the first operand and 4-7 elements of
     the second; each (k, 4 + k) pair interleaves element k of both inputs. */
  __v4si __lhs = (__v4si)__a;
  __v4si __rhs = (__v4si)__b;
  return (__m128i)__builtin_shufflevector(__lhs, __rhs,
                                          0, 4 + 0,
                                          1, 4 + 1);
}
4500
 
4501
/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4502
///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4503
///
4504
/// \headerfile <x86intrin.h>
4505
///
4506
/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4507
///   instruction.
4508
///
4509
/// \param __a
4510
///    A 128-bit vector of [2 x i64]. \n
4511
///    Bits [63:0] are written to bits [63:0] of the destination.
4512
/// \param __b
4513
///    A 128-bit vector of [2 x i64]. \n
4514
///    Bits [63:0] are written to bits [127:64] of the destination.
4515
/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4516
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
                                                                __m128i __b) {
  /* Index 0 is the lower element of __a; index 2 + 0 is the lower element of
     __b (indices 2-3 address the second operand). */
  __v2di __lhs = (__v2di)__a;
  __v2di __rhs = (__v2di)__b;
  return (__m128i)__builtin_shufflevector(__lhs, __rhs, 0, 2 + 0);
}
4520
 
4521
/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4522
///    integer.
4523
///
4524
/// \headerfile <x86intrin.h>
4525
///
4526
/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4527
///
4528
/// \param __a
4529
///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4530
///    destination.
4531
/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4532
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
  /* Element 0 of the vector is its lower 64 bits. */
  long long __low = __a[0];
  return (__m64)__low;
}
4535
 
4536
/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4537
///    upper bits.
4538
///
4539
/// \headerfile <x86intrin.h>
4540
///
4541
/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4542
///
4543
/// \param __a
4544
///    A 64-bit value.
4545
/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4546
///    the operand. The upper 64 bits are assigned zeros.
4547
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
  /* Element 0 receives the operand; element 1 is explicitly zero. */
  __v2di __widened = __extension__(__v2di){(long long)__a, 0};
  return (__m128i)__widened;
}
4550
 
4551
/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4552
///    integer vector, zeroing the upper bits.
4553
///
4554
/// \headerfile <x86intrin.h>
4555
///
4556
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4557
///
4558
/// \param __a
4559
///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4560
///    destination.
4561
/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4562
///    the operand. The upper 64 bits are assigned zeros.
4563
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
  /* Index 0 keeps the lower element of __a; index 2 selects element 0 of the
     all-zero second operand, which clears the upper half of the result. */
  __v2di __src = (__v2di)__a;
  return __builtin_shufflevector(__src, _mm_setzero_si128(), 0, 2);
}
4566
 
4567
/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4568
///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4569
///    double].
4570
///
4571
/// \headerfile <x86intrin.h>
4572
///
4573
/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4574
///
4575
/// \param __a
4576
///    A 128-bit vector of [2 x double]. \n
4577
///    Bits [127:64] are written to bits [63:0] of the destination.
4578
/// \param __b
4579
///    A 128-bit vector of [2 x double]. \n
4580
///    Bits [127:64] are written to bits [127:64] of the destination.
4581
/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4582
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
                                                             __m128d __b) {
  /* Index 1 is the upper element of __a; index 2 + 1 is the upper element of
     __b (indices 2-3 address the second operand). */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return __builtin_shufflevector(__lhs, __rhs, 1, 2 + 1);
}
4586
 
4587
/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4588
///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4589
///    double].
4590
///
4591
/// \headerfile <x86intrin.h>
4592
///
4593
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4594
///
4595
/// \param __a
4596
///    A 128-bit vector of [2 x double]. \n
4597
///    Bits [63:0] are written to bits [63:0] of the destination.
4598
/// \param __b
4599
///    A 128-bit vector of [2 x double]. \n
4600
///    Bits [63:0] are written to bits [127:64] of the destination.
4601
/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4602
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
                                                             __m128d __b) {
  /* Index 0 is the lower element of __a; index 2 + 0 is the lower element of
     __b (indices 2-3 address the second operand). */
  __v2df __lhs = (__v2df)__a;
  __v2df __rhs = (__v2df)__b;
  return __builtin_shufflevector(__lhs, __rhs, 0, 2 + 0);
}
4606
 
4607
/// Extracts the sign bits of the double-precision values in the 128-bit
4608
///    vector of [2 x double], zero-extends the value, and writes it to the
4609
///    low-order bits of the destination.
4610
///
4611
/// \headerfile <x86intrin.h>
4612
///
4613
/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4614
///
4615
/// \param __a
4616
///    A 128-bit vector of [2 x double] containing the values with sign bits to
4617
///    be extracted.
4618
/// \returns The sign bits from each of the double-precision elements in \a __a,
4619
///    written to bits [1:0]. The remaining bits are assigned values of zero.
4620
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
  /* The MOVMSKPD builtin takes its operand as [2 x double]. */
  __v2df __lanes = (__v2df)__a;
  return __builtin_ia32_movmskpd(__lanes);
}
4623
 
4624
/// Constructs a 128-bit floating-point vector of [2 x double] from two
4625
///    128-bit vector parameters of [2 x double], using the immediate-value
4626
///     parameter as a specifier.
4627
///
4628
/// \headerfile <x86intrin.h>
4629
///
4630
/// \code
4631
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4632
/// \endcode
4633
///
4634
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4635
///
4636
/// \param a
4637
///    A 128-bit vector of [2 x double].
4638
/// \param b
4639
///    A 128-bit vector of [2 x double].
4640
/// \param i
4641
///    An 8-bit immediate value. The least significant two bits specify which
4642
///    elements to copy from \a a and \a b: \n
4643
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4644
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4645
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4646
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4647
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4648
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4649
///    <c>[b1, b0]</c>.
4650
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
4651
#define _mm_shuffle_pd(a, b, i) \
  /* Both vector operands are viewed as [2 x double]; i is passed as int. */ \
  ((__m128d)__builtin_ia32_shufpd( \
      (__v2df)(__m128d)(a), \
      (__v2df)(__m128d)(b), \
      (int)(i)))
4654
 
4655
/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4656
///    floating-point vector of [4 x float].
4657
///
4658
/// \headerfile <x86intrin.h>
4659
///
4660
/// This intrinsic has no corresponding instruction.
4661
///
4662
/// \param __a
4663
///    A 128-bit floating-point vector of [2 x double].
4664
/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4665
///    bitwise pattern as the parameter.
4666
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
  /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
  __m128 __bits = (__m128)__a;
  return __bits;
}
4669
 
4670
/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4671
///    integer vector.
4672
///
4673
/// \headerfile <x86intrin.h>
4674
///
4675
/// This intrinsic has no corresponding instruction.
4676
///
4677
/// \param __a
4678
///    A 128-bit floating-point vector of [2 x double].
4679
/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4680
///    parameter.
4681
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
  /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
  __m128i __bits = (__m128i)__a;
  return __bits;
}
4684
 
4685
/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4686
///    floating-point vector of [2 x double].
4687
///
4688
/// \headerfile <x86intrin.h>
4689
///
4690
/// This intrinsic has no corresponding instruction.
4691
///
4692
/// \param __a
4693
///    A 128-bit floating-point vector of [4 x float].
4694
/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4695
///    bitwise pattern as the parameter.
4696
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
  /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
  __m128d __bits = (__m128d)__a;
  return __bits;
}
4699
 
4700
/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4701
///    integer vector.
4702
///
4703
/// \headerfile <x86intrin.h>
4704
///
4705
/// This intrinsic has no corresponding instruction.
4706
///
4707
/// \param __a
4708
///    A 128-bit floating-point vector of [4 x float].
4709
/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4710
///    parameter.
4711
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
  /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
  __m128i __bits = (__m128i)__a;
  return __bits;
}
4714
 
4715
/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4716
///    of [4 x float].
4717
///
4718
/// \headerfile <x86intrin.h>
4719
///
4720
/// This intrinsic has no corresponding instruction.
4721
///
4722
/// \param __a
4723
///    A 128-bit integer vector.
4724
/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4725
///    bitwise pattern as the parameter.
4726
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
  /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
  __m128 __bits = (__m128)__a;
  return __bits;
}
4729
 
4730
/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4731
///    of [2 x double].
4732
///
4733
/// \headerfile <x86intrin.h>
4734
///
4735
/// This intrinsic has no corresponding instruction.
4736
///
4737
/// \param __a
4738
///    A 128-bit integer vector.
4739
/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4740
///    bitwise pattern as the parameter.
4741
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
  /* Vector-to-vector cast: the 128 bits are reinterpreted, not converted. */
  __m128d __bits = (__m128d)__a;
  return __bits;
}
4744
 
4745
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the caller is executing a spin loop, so that power
///    consumption can be optimized while the loop runs.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
4761
/* Remove the attribute helper macros so they are not visible after this
   point. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS

/* Builds a two-bit selector: x is placed in bit 1 and y in bit 0. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Denormals-zero mode: the mask isolates the mode bit within the value
   returned by _mm_getcsr(); ON and OFF are the two values of that bit. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)
#define _MM_DENORMALS_ZERO_ON (0x0040U)

/* Read the current mode, or replace it while preserving every other bit of
   the control/status value. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
#define _MM_SET_DENORMALS_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4774
 
4775
#endif /* __EMMINTRIN_H */