Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
 
10
#ifndef __XMMINTRIN_H
11
#define __XMMINTRIN_H
12
 
13
#if !defined(__i386__) && !defined(__x86_64__)
14
#error "This header is only meant to be used on x86 and x64 architecture"
15
#endif
16
 
17
#include <mmintrin.h>
18
 
19
/* Internal 16-byte vector of int elements. */
typedef int __v4si __attribute__((__vector_size__(16)));
20
/* Internal 16-byte vector of float elements; the intrinsics below cast their
 * operands to this type for arithmetic and for the __builtin_ia32_* calls. */
typedef float __v4sf __attribute__((__vector_size__(16)));
21
/* 16-byte vector of float elements with 16-byte alignment; the parameter and
 * return type of the intrinsics defined below. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
22
 
23
/* Same vector layout as __m128 but with the alignment lowered to 1 byte.
 * NOTE(review): presumably meant for unaligned memory accesses; no use is
 * visible in this part of the file -- confirm. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
24
 
25
/* Unsigned types */
26
/* 16-byte vector of unsigned int elements; the bitwise intrinsics below cast
 * their operands to this type. */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
27
 
28
/* This header should only be included in a hosted environment as it depends on
29
 * a standard library to provide allocation routines. */
30
#if __STDC_HOSTED__
31
#include <mm_malloc.h>
32
#endif
33
 
34
/* Default attributes for the functions in this file.
 * __DEFAULT_FN_ATTRS:     __always_inline__, __nodebug__, target feature
 *                         "sse", minimum vector width 128.
 * __DEFAULT_FN_ATTRS_MMX: __always_inline__, __nodebug__, target features
 *                         "mmx,sse", minimum vector width 64. */
35
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
36
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
37
 
38
/// Adds the 32-bit float values in the low-order bits of the operands.
39
///
40
/// \headerfile <x86intrin.h>
41
///
42
/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
43
///
44
/// \param __a
45
///    A 128-bit vector of [4 x float] containing one of the source operands.
46
///    The lower 32 bits of this operand are used in the calculation.
47
/// \param __b
48
///    A 128-bit vector of [4 x float] containing one of the source operands.
49
///    The lower 32 bits of this operand are used in the calculation.
50
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
51
///    of the lower 32 bits of both operands. The upper 96 bits are copied from
52
///    the upper 96 bits of the first source operand.
53
static __inline__ __m128 __DEFAULT_FN_ATTRS
54
_mm_add_ss(__m128 __a, __m128 __b)
55
{
56
  __a[0] += __b[0];
57
  return __a;
58
}
59
 
60
/// Adds two 128-bit vectors of [4 x float], and returns the results of
61
///    the addition.
62
///
63
/// \headerfile <x86intrin.h>
64
///
65
/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
66
///
67
/// \param __a
68
///    A 128-bit vector of [4 x float] containing one of the source operands.
69
/// \param __b
70
///    A 128-bit vector of [4 x float] containing one of the source operands.
71
/// \returns A 128-bit vector of [4 x float] containing the sums of both
72
///    operands.
73
static __inline__ __m128 __DEFAULT_FN_ATTRS
74
_mm_add_ps(__m128 __a, __m128 __b)
75
{
76
  return (__m128)((__v4sf)__a + (__v4sf)__b);
77
}
78
 
79
/// Subtracts the 32-bit float value in the low-order bits of the second
80
///    operand from the corresponding value in the first operand.
81
///
82
/// \headerfile <x86intrin.h>
83
///
84
/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
85
///
86
/// \param __a
87
///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
88
///    of this operand are used in the calculation.
89
/// \param __b
90
///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
91
///    bits of this operand are used in the calculation.
92
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
93
///    difference of the lower 32 bits of both operands. The upper 96 bits are
94
///    copied from the upper 96 bits of the first source operand.
95
static __inline__ __m128 __DEFAULT_FN_ATTRS
96
_mm_sub_ss(__m128 __a, __m128 __b)
97
{
98
  __a[0] -= __b[0];
99
  return __a;
100
}
101
 
102
/// Subtracts each of the values of the second operand from the first
103
///    operand, both of which are 128-bit vectors of [4 x float] and returns
104
///    the results of the subtraction.
105
///
106
/// \headerfile <x86intrin.h>
107
///
108
/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
109
///
110
/// \param __a
111
///    A 128-bit vector of [4 x float] containing the minuend.
112
/// \param __b
113
///    A 128-bit vector of [4 x float] containing the subtrahend.
114
/// \returns A 128-bit vector of [4 x float] containing the differences between
115
///    both operands.
116
static __inline__ __m128 __DEFAULT_FN_ATTRS
117
_mm_sub_ps(__m128 __a, __m128 __b)
118
{
119
  return (__m128)((__v4sf)__a - (__v4sf)__b);
120
}
121
 
122
/// Multiplies two 32-bit float values in the low-order bits of the
123
///    operands.
124
///
125
/// \headerfile <x86intrin.h>
126
///
127
/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
128
///
129
/// \param __a
130
///    A 128-bit vector of [4 x float] containing one of the source operands.
131
///    The lower 32 bits of this operand are used in the calculation.
132
/// \param __b
133
///    A 128-bit vector of [4 x float] containing one of the source operands.
134
///    The lower 32 bits of this operand are used in the calculation.
135
/// \returns A 128-bit vector of [4 x float] containing the product of the lower
136
///    32 bits of both operands. The upper 96 bits are copied from the upper 96
137
///    bits of the first source operand.
138
static __inline__ __m128 __DEFAULT_FN_ATTRS
139
_mm_mul_ss(__m128 __a, __m128 __b)
140
{
141
  __a[0] *= __b[0];
142
  return __a;
143
}
144
 
145
/// Multiplies two 128-bit vectors of [4 x float] and returns the
146
///    results of the multiplication.
147
///
148
/// \headerfile <x86intrin.h>
149
///
150
/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
151
///
152
/// \param __a
153
///    A 128-bit vector of [4 x float] containing one of the source operands.
154
/// \param __b
155
///    A 128-bit vector of [4 x float] containing one of the source operands.
156
/// \returns A 128-bit vector of [4 x float] containing the products of both
157
///    operands.
158
static __inline__ __m128 __DEFAULT_FN_ATTRS
159
_mm_mul_ps(__m128 __a, __m128 __b)
160
{
161
  return (__m128)((__v4sf)__a * (__v4sf)__b);
162
}
163
 
164
/// Divides the value in the low-order 32 bits of the first operand by
165
///    the corresponding value in the second operand.
166
///
167
/// \headerfile <x86intrin.h>
168
///
169
/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
170
///
171
/// \param __a
172
///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
173
///    bits of this operand are used in the calculation.
174
/// \param __b
175
///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
176
///    of this operand are used in the calculation.
177
/// \returns A 128-bit vector of [4 x float] containing the quotients of the
178
///    lower 32 bits of both operands. The upper 96 bits are copied from the
179
///    upper 96 bits of the first source operand.
180
static __inline__ __m128 __DEFAULT_FN_ATTRS
181
_mm_div_ss(__m128 __a, __m128 __b)
182
{
183
  __a[0] /= __b[0];
184
  return __a;
185
}
186
 
187
/// Divides two 128-bit vectors of [4 x float].
188
///
189
/// \headerfile <x86intrin.h>
190
///
191
/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
192
///
193
/// \param __a
194
///    A 128-bit vector of [4 x float] containing the dividend.
195
/// \param __b
196
///    A 128-bit vector of [4 x float] containing the divisor.
197
/// \returns A 128-bit vector of [4 x float] containing the quotients of both
198
///    operands.
199
static __inline__ __m128 __DEFAULT_FN_ATTRS
200
_mm_div_ps(__m128 __a, __m128 __b)
201
{
202
  return (__m128)((__v4sf)__a / (__v4sf)__b);
203
}
204
 
205
/// Calculates the square root of the value stored in the low-order bits
206
///    of a 128-bit vector of [4 x float].
207
///
208
/// \headerfile <x86intrin.h>
209
///
210
/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
211
///
212
/// \param __a
213
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
214
///    used in the calculation.
215
/// \returns A 128-bit vector of [4 x float] containing the square root of the
216
///    value in the low-order bits of the operand.
217
static __inline__ __m128 __DEFAULT_FN_ATTRS
218
_mm_sqrt_ss(__m128 __a)
219
{
220
  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
221
}
222
 
223
/// Calculates the square roots of the values stored in a 128-bit vector
224
///    of [4 x float].
225
///
226
/// \headerfile <x86intrin.h>
227
///
228
/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
229
///
230
/// \param __a
231
///    A 128-bit vector of [4 x float].
232
/// \returns A 128-bit vector of [4 x float] containing the square roots of the
233
///    values in the operand.
234
static __inline__ __m128 __DEFAULT_FN_ATTRS
235
_mm_sqrt_ps(__m128 __a)
236
{
237
  return __builtin_ia32_sqrtps((__v4sf)__a);
238
}
239
 
240
/// Calculates the approximate reciprocal of the value stored in the
241
///    low-order bits of a 128-bit vector of [4 x float].
242
///
243
/// \headerfile <x86intrin.h>
244
///
245
/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
246
///
247
/// \param __a
248
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
249
///    used in the calculation.
250
/// \returns A 128-bit vector of [4 x float] containing the approximate
251
///    reciprocal of the value in the low-order bits of the operand.
252
static __inline__ __m128 __DEFAULT_FN_ATTRS
253
_mm_rcp_ss(__m128 __a)
254
{
255
  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
256
}
257
 
258
/// Calculates the approximate reciprocals of the values stored in a
259
///    128-bit vector of [4 x float].
260
///
261
/// \headerfile <x86intrin.h>
262
///
263
/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
264
///
265
/// \param __a
266
///    A 128-bit vector of [4 x float].
267
/// \returns A 128-bit vector of [4 x float] containing the approximate
268
///    reciprocals of the values in the operand.
269
static __inline__ __m128 __DEFAULT_FN_ATTRS
270
_mm_rcp_ps(__m128 __a)
271
{
272
  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
273
}
274
 
275
/// Calculates the approximate reciprocal of the square root of the value
276
///    stored in the low-order bits of a 128-bit vector of [4 x float].
277
///
278
/// \headerfile <x86intrin.h>
279
///
280
/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
281
///
282
/// \param __a
283
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
284
///    used in the calculation.
285
/// \returns A 128-bit vector of [4 x float] containing the approximate
286
///    reciprocal of the square root of the value in the low-order bits of the
287
///    operand.
288
static __inline__ __m128 __DEFAULT_FN_ATTRS
289
_mm_rsqrt_ss(__m128 __a)
290
{
291
  return __builtin_ia32_rsqrtss((__v4sf)__a);
292
}
293
 
294
/// Calculates the approximate reciprocals of the square roots of the
295
///    values stored in a 128-bit vector of [4 x float].
296
///
297
/// \headerfile <x86intrin.h>
298
///
299
/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
300
///
301
/// \param __a
302
///    A 128-bit vector of [4 x float].
303
/// \returns A 128-bit vector of [4 x float] containing the approximate
304
///    reciprocals of the square roots of the values in the operand.
305
static __inline__ __m128 __DEFAULT_FN_ATTRS
306
_mm_rsqrt_ps(__m128 __a)
307
{
308
  return __builtin_ia32_rsqrtps((__v4sf)__a);
309
}
310
 
311
/// Compares two 32-bit float values in the low-order bits of both
312
///    operands and returns the lesser value in the low-order bits of the
313
///    vector of [4 x float].
314
///
315
/// \headerfile <x86intrin.h>
316
///
317
/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
318
///
319
/// \param __a
320
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
321
///    32 bits of this operand are used in the comparison.
322
/// \param __b
323
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
324
///    32 bits of this operand are used in the comparison.
325
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
326
///    minimum value between both operands. The upper 96 bits are copied from
327
///    the upper 96 bits of the first source operand.
328
static __inline__ __m128 __DEFAULT_FN_ATTRS
329
_mm_min_ss(__m128 __a, __m128 __b)
330
{
331
  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
332
}
333
 
334
/// Compares two 128-bit vectors of [4 x float] and returns the lesser
335
///    of each pair of values.
336
///
337
/// \headerfile <x86intrin.h>
338
///
339
/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
340
///
341
/// \param __a
342
///    A 128-bit vector of [4 x float] containing one of the operands.
343
/// \param __b
344
///    A 128-bit vector of [4 x float] containing one of the operands.
345
/// \returns A 128-bit vector of [4 x float] containing the minimum values
346
///    between both operands.
347
static __inline__ __m128 __DEFAULT_FN_ATTRS
348
_mm_min_ps(__m128 __a, __m128 __b)
349
{
350
  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
351
}
352
 
353
/// Compares two 32-bit float values in the low-order bits of both
354
///    operands and returns the greater value in the low-order bits of a 128-bit
355
///    vector of [4 x float].
356
///
357
/// \headerfile <x86intrin.h>
358
///
359
/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
360
///
361
/// \param __a
362
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
363
///    32 bits of this operand are used in the comparison.
364
/// \param __b
365
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
366
///    32 bits of this operand are used in the comparison.
367
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
368
///    maximum value between both operands. The upper 96 bits are copied from
369
///    the upper 96 bits of the first source operand.
370
static __inline__ __m128 __DEFAULT_FN_ATTRS
371
_mm_max_ss(__m128 __a, __m128 __b)
372
{
373
  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
374
}
375
 
376
/// Compares two 128-bit vectors of [4 x float] and returns the greater
377
///    of each pair of values.
378
///
379
/// \headerfile <x86intrin.h>
380
///
381
/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
382
///
383
/// \param __a
384
///    A 128-bit vector of [4 x float] containing one of the operands.
385
/// \param __b
386
///    A 128-bit vector of [4 x float] containing one of the operands.
387
/// \returns A 128-bit vector of [4 x float] containing the maximum values
388
///    between both operands.
389
static __inline__ __m128 __DEFAULT_FN_ATTRS
390
_mm_max_ps(__m128 __a, __m128 __b)
391
{
392
  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
393
}
394
 
395
/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
396
///
397
/// \headerfile <x86intrin.h>
398
///
399
/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
400
///
401
/// \param __a
402
///    A 128-bit vector containing one of the source operands.
403
/// \param __b
404
///    A 128-bit vector containing one of the source operands.
405
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
406
///    values between both operands.
407
static __inline__ __m128 __DEFAULT_FN_ATTRS
408
_mm_and_ps(__m128 __a, __m128 __b)
409
{
410
  return (__m128)((__v4su)__a & (__v4su)__b);
411
}
412
 
413
/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
414
///    the one's complement of the values contained in the first source
415
///    operand.
416
///
417
/// \headerfile <x86intrin.h>
418
///
419
/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
420
///
421
/// \param __a
422
///    A 128-bit vector of [4 x float] containing the first source operand. The
423
///    one's complement of this value is used in the bitwise AND.
424
/// \param __b
425
///    A 128-bit vector of [4 x float] containing the second source operand.
426
/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
427
///    one's complement of the first operand and the values in the second
428
///    operand.
429
static __inline__ __m128 __DEFAULT_FN_ATTRS
430
_mm_andnot_ps(__m128 __a, __m128 __b)
431
{
432
  return (__m128)(~(__v4su)__a & (__v4su)__b);
433
}
434
 
435
/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
436
///
437
/// \headerfile <x86intrin.h>
438
///
439
/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
440
///
441
/// \param __a
442
///    A 128-bit vector of [4 x float] containing one of the source operands.
443
/// \param __b
444
///    A 128-bit vector of [4 x float] containing one of the source operands.
445
/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
446
///    values between both operands.
447
static __inline__ __m128 __DEFAULT_FN_ATTRS
448
_mm_or_ps(__m128 __a, __m128 __b)
449
{
450
  return (__m128)((__v4su)__a | (__v4su)__b);
451
}
452
 
453
/// Performs a bitwise exclusive OR of two 128-bit vectors of
454
///    [4 x float].
455
///
456
/// \headerfile <x86intrin.h>
457
///
458
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
459
///
460
/// \param __a
461
///    A 128-bit vector of [4 x float] containing one of the source operands.
462
/// \param __b
463
///    A 128-bit vector of [4 x float] containing one of the source operands.
464
/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
465
///    of the values between both operands.
466
static __inline__ __m128 __DEFAULT_FN_ATTRS
467
_mm_xor_ps(__m128 __a, __m128 __b)
468
{
469
  return (__m128)((__v4su)__a ^ (__v4su)__b);
470
}
471
 
472
/// Compares two 32-bit float values in the low-order bits of both
473
///    operands for equality and returns the result of the comparison in the
474
///    low-order bits of a vector [4 x float].
475
///
476
/// \headerfile <x86intrin.h>
477
///
478
/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
479
///
480
/// \param __a
481
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
482
///    32 bits of this operand are used in the comparison.
483
/// \param __b
484
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
485
///    32 bits of this operand are used in the comparison.
486
/// \returns A 128-bit vector of [4 x float] containing the comparison results
487
///    in the low-order bits.
488
static __inline__ __m128 __DEFAULT_FN_ATTRS
489
_mm_cmpeq_ss(__m128 __a, __m128 __b)
490
{
491
  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
492
}
493
 
494
/// Compares each of the corresponding 32-bit float values of the
495
///    128-bit vectors of [4 x float] for equality.
496
///
497
/// \headerfile <x86intrin.h>
498
///
499
/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
500
///
501
/// \param __a
502
///    A 128-bit vector of [4 x float].
503
/// \param __b
504
///    A 128-bit vector of [4 x float].
505
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
506
static __inline__ __m128 __DEFAULT_FN_ATTRS
507
_mm_cmpeq_ps(__m128 __a, __m128 __b)
508
{
509
  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
510
}
511
 
512
/// Compares two 32-bit float values in the low-order bits of both
513
///    operands to determine if the value in the first operand is less than the
514
///    corresponding value in the second operand and returns the result of the
515
///    comparison in the low-order bits of a vector of [4 x float].
516
///
517
/// \headerfile <x86intrin.h>
518
///
519
/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
520
///
521
/// \param __a
522
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
523
///    32 bits of this operand are used in the comparison.
524
/// \param __b
525
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
526
///    32 bits of this operand are used in the comparison.
527
/// \returns A 128-bit vector of [4 x float] containing the comparison results
528
///    in the low-order bits.
529
static __inline__ __m128 __DEFAULT_FN_ATTRS
530
_mm_cmplt_ss(__m128 __a, __m128 __b)
531
{
532
  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
533
}
534
 
535
/// Compares each of the corresponding 32-bit float values of the
536
///    128-bit vectors of [4 x float] to determine if the values in the first
537
///    operand are less than those in the second operand.
538
///
539
/// \headerfile <x86intrin.h>
540
///
541
/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
542
///
543
/// \param __a
544
///    A 128-bit vector of [4 x float].
545
/// \param __b
546
///    A 128-bit vector of [4 x float].
547
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
548
static __inline__ __m128 __DEFAULT_FN_ATTRS
549
_mm_cmplt_ps(__m128 __a, __m128 __b)
550
{
551
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
552
}
553
 
554
/// Compares two 32-bit float values in the low-order bits of both
555
///    operands to determine if the value in the first operand is less than or
556
///    equal to the corresponding value in the second operand and returns the
557
///    result of the comparison in the low-order bits of a vector of
558
///    [4 x float].
559
///
560
/// \headerfile <x86intrin.h>
561
///
562
/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
563
///
564
/// \param __a
565
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
566
///    32 bits of this operand are used in the comparison.
567
/// \param __b
568
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
569
///    32 bits of this operand are used in the comparison.
570
/// \returns A 128-bit vector of [4 x float] containing the comparison results
571
///    in the low-order bits.
572
static __inline__ __m128 __DEFAULT_FN_ATTRS
573
_mm_cmple_ss(__m128 __a, __m128 __b)
574
{
575
  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
576
}
577
 
578
/// Compares each of the corresponding 32-bit float values of the
579
///    128-bit vectors of [4 x float] to determine if the values in the first
580
///    operand are less than or equal to those in the second operand.
581
///
582
/// \headerfile <x86intrin.h>
583
///
584
/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
585
///
586
/// \param __a
587
///    A 128-bit vector of [4 x float].
588
/// \param __b
589
///    A 128-bit vector of [4 x float].
590
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
591
static __inline__ __m128 __DEFAULT_FN_ATTRS
592
_mm_cmple_ps(__m128 __a, __m128 __b)
593
{
594
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
595
}
596
 
597
/// Compares two 32-bit float values in the low-order bits of both
598
///    operands to determine if the value in the first operand is greater than
599
///    the corresponding value in the second operand and returns the result of
600
///    the comparison in the low-order bits of a vector of [4 x float].
601
///
602
/// \headerfile <x86intrin.h>
603
///
604
/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
605
///
606
/// \param __a
607
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
608
///    32 bits of this operand are used in the comparison.
609
/// \param __b
610
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
611
///    32 bits of this operand are used in the comparison.
612
/// \returns A 128-bit vector of [4 x float] containing the comparison results
613
///    in the low-order bits.
614
static __inline__ __m128 __DEFAULT_FN_ATTRS
615
_mm_cmpgt_ss(__m128 __a, __m128 __b)
616
{
617
  return (__m128)__builtin_shufflevector((__v4sf)__a,
618
                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
619
                                         4, 1, 2, 3);
620
}
621
 
622
/// Compares each of the corresponding 32-bit float values of the
623
///    128-bit vectors of [4 x float] to determine if the values in the first
624
///    operand are greater than those in the second operand.
625
///
626
/// \headerfile <x86intrin.h>
627
///
628
/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
629
///
630
/// \param __a
631
///    A 128-bit vector of [4 x float].
632
/// \param __b
633
///    A 128-bit vector of [4 x float].
634
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
635
static __inline__ __m128 __DEFAULT_FN_ATTRS
636
_mm_cmpgt_ps(__m128 __a, __m128 __b)
637
{
638
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
639
}
640
 
641
/// Compares two 32-bit float values in the low-order bits of both
642
///    operands to determine if the value in the first operand is greater than
643
///    or equal to the corresponding value in the second operand and returns
644
///    the result of the comparison in the low-order bits of a vector of
645
///    [4 x float].
646
///
647
/// \headerfile <x86intrin.h>
648
///
649
/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
650
///
651
/// \param __a
652
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
653
///    32 bits of this operand are used in the comparison.
654
/// \param __b
655
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
656
///    32 bits of this operand are used in the comparison.
657
/// \returns A 128-bit vector of [4 x float] containing the comparison results
658
///    in the low-order bits.
659
static __inline__ __m128 __DEFAULT_FN_ATTRS
660
_mm_cmpge_ss(__m128 __a, __m128 __b)
661
{
662
  return (__m128)__builtin_shufflevector((__v4sf)__a,
663
                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
664
                                         4, 1, 2, 3);
665
}
666
 
667
/// Compares each of the corresponding 32-bit float values of the
668
///    128-bit vectors of [4 x float] to determine if the values in the first
669
///    operand are greater than or equal to those in the second operand.
670
///
671
/// \headerfile <x86intrin.h>
672
///
673
/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
674
///
675
/// \param __a
676
///    A 128-bit vector of [4 x float].
677
/// \param __b
678
///    A 128-bit vector of [4 x float].
679
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
680
static __inline__ __m128 __DEFAULT_FN_ATTRS
681
_mm_cmpge_ps(__m128 __a, __m128 __b)
682
{
683
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
684
}
685
 
686
/// Compares two 32-bit float values in the low-order bits of both
687
///    operands for inequality and returns the result of the comparison in the
688
///    low-order bits of a vector of [4 x float].
689
///
690
/// \headerfile <x86intrin.h>
691
///
692
/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
693
///   instructions.
694
///
695
/// \param __a
696
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
697
///    32 bits of this operand are used in the comparison.
698
/// \param __b
699
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
700
///    32 bits of this operand are used in the comparison.
701
/// \returns A 128-bit vector of [4 x float] containing the comparison results
702
///    in the low-order bits.
703
static __inline__ __m128 __DEFAULT_FN_ATTRS
704
_mm_cmpneq_ss(__m128 __a, __m128 __b)
705
{
706
  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
707
}
708
 
709
/// Compares each of the corresponding 32-bit float values of the
710
///    128-bit vectors of [4 x float] for inequality.
711
///
712
/// \headerfile <x86intrin.h>
713
///
714
/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
715
///   instructions.
716
///
717
/// \param __a
718
///    A 128-bit vector of [4 x float].
719
/// \param __b
720
///    A 128-bit vector of [4 x float].
721
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
722
static __inline__ __m128 __DEFAULT_FN_ATTRS
723
_mm_cmpneq_ps(__m128 __a, __m128 __b)
724
{
725
  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
726
}
727
 
728
/// Compares two 32-bit float values in the low-order bits of both
729
///    operands to determine if the value in the first operand is not less than
730
///    the corresponding value in the second operand and returns the result of
731
///    the comparison in the low-order bits of a vector of [4 x float].
732
///
733
/// \headerfile <x86intrin.h>
734
///
735
/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
736
///   instructions.
737
///
738
/// \param __a
739
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
740
///    32 bits of this operand are used in the comparison.
741
/// \param __b
742
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
743
///    32 bits of this operand are used in the comparison.
744
/// \returns A 128-bit vector of [4 x float] containing the comparison results
745
///    in the low-order bits.
746
static __inline__ __m128 __DEFAULT_FN_ATTRS
747
_mm_cmpnlt_ss(__m128 __a, __m128 __b)
748
{
749
  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
750
}
751
 
752
/// Compares each of the corresponding 32-bit float values of the
753
///    128-bit vectors of [4 x float] to determine if the values in the first
754
///    operand are not less than those in the second operand.
755
///
756
/// \headerfile <x86intrin.h>
757
///
758
/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
759
///   instructions.
760
///
761
/// \param __a
762
///    A 128-bit vector of [4 x float].
763
/// \param __b
764
///    A 128-bit vector of [4 x float].
765
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
766
static __inline__ __m128 __DEFAULT_FN_ATTRS
767
_mm_cmpnlt_ps(__m128 __a, __m128 __b)
768
{
769
  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
770
}
771
 
772
/// Compares two 32-bit float values in the low-order bits of both
773
///    operands to determine if the value in the first operand is not less than
774
///    or equal to the corresponding value in the second operand and returns
775
///    the result of the comparison in the low-order bits of a vector of
776
///    [4 x float].
777
///
778
/// \headerfile <x86intrin.h>
779
///
780
/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
781
///   instructions.
782
///
783
/// \param __a
784
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
785
///    32 bits of this operand are used in the comparison.
786
/// \param __b
787
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
788
///    32 bits of this operand are used in the comparison.
789
/// \returns A 128-bit vector of [4 x float] containing the comparison results
790
///    in the low-order bits.
791
static __inline__ __m128 __DEFAULT_FN_ATTRS
792
_mm_cmpnle_ss(__m128 __a, __m128 __b)
793
{
794
  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
795
}
796
 
797
/// Compares each of the corresponding 32-bit float values of the
798
///    128-bit vectors of [4 x float] to determine if the values in the first
799
///    operand are not less than or equal to those in the second operand.
800
///
801
/// \headerfile <x86intrin.h>
802
///
803
/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
804
///   instructions.
805
///
806
/// \param __a
807
///    A 128-bit vector of [4 x float].
808
/// \param __b
809
///    A 128-bit vector of [4 x float].
810
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
811
static __inline__ __m128 __DEFAULT_FN_ATTRS
812
_mm_cmpnle_ps(__m128 __a, __m128 __b)
813
{
814
  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
815
}
816
 
817
/// Compares two 32-bit float values in the low-order bits of both
818
///    operands to determine if the value in the first operand is not greater
819
///    than the corresponding value in the second operand and returns the
820
///    result of the comparison in the low-order bits of a vector of
821
///    [4 x float].
822
///
823
/// \headerfile <x86intrin.h>
824
///
825
/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
826
///   instructions.
827
///
828
/// \param __a
829
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
830
///    32 bits of this operand are used in the comparison.
831
/// \param __b
832
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
833
///    32 bits of this operand are used in the comparison.
834
/// \returns A 128-bit vector of [4 x float] containing the comparison results
835
///    in the low-order bits.
836
static __inline__ __m128 __DEFAULT_FN_ATTRS
837
_mm_cmpngt_ss(__m128 __a, __m128 __b)
838
{
839
  return (__m128)__builtin_shufflevector((__v4sf)__a,
840
                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
841
                                         4, 1, 2, 3);
842
}
843
 
844
/// Compares each of the corresponding 32-bit float values of the
845
///    128-bit vectors of [4 x float] to determine if the values in the first
846
///    operand are not greater than those in the second operand.
847
///
848
/// \headerfile <x86intrin.h>
849
///
850
/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
851
///   instructions.
852
///
853
/// \param __a
854
///    A 128-bit vector of [4 x float].
855
/// \param __b
856
///    A 128-bit vector of [4 x float].
857
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
858
static __inline__ __m128 __DEFAULT_FN_ATTRS
859
_mm_cmpngt_ps(__m128 __a, __m128 __b)
860
{
861
  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
862
}
863
 
864
/// Compares two 32-bit float values in the low-order bits of both
865
///    operands to determine if the value in the first operand is not greater
866
///    than or equal to the corresponding value in the second operand and
867
///    returns the result of the comparison in the low-order bits of a vector
868
///    of [4 x float].
869
///
870
/// \headerfile <x86intrin.h>
871
///
872
/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
873
///   instructions.
874
///
875
/// \param __a
876
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
877
///    32 bits of this operand are used in the comparison.
878
/// \param __b
879
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
880
///    32 bits of this operand are used in the comparison.
881
/// \returns A 128-bit vector of [4 x float] containing the comparison results
882
///    in the low-order bits.
883
static __inline__ __m128 __DEFAULT_FN_ATTRS
884
_mm_cmpnge_ss(__m128 __a, __m128 __b)
885
{
886
  return (__m128)__builtin_shufflevector((__v4sf)__a,
887
                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
888
                                         4, 1, 2, 3);
889
}
890
 
891
/// Compares each of the corresponding 32-bit float values of the
892
///    128-bit vectors of [4 x float] to determine if the values in the first
893
///    operand are not greater than or equal to those in the second operand.
894
///
895
/// \headerfile <x86intrin.h>
896
///
897
/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
898
///   instructions.
899
///
900
/// \param __a
901
///    A 128-bit vector of [4 x float].
902
/// \param __b
903
///    A 128-bit vector of [4 x float].
904
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
905
static __inline__ __m128 __DEFAULT_FN_ATTRS
906
_mm_cmpnge_ps(__m128 __a, __m128 __b)
907
{
908
  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
909
}
910
 
911
/// Compares two 32-bit float values in the low-order bits of both
912
///    operands to determine if the value in the first operand is ordered with
913
///    respect to the corresponding value in the second operand and returns the
914
///    result of the comparison in the low-order bits of a vector of
915
///    [4 x float].
916
///
917
/// \headerfile <x86intrin.h>
918
///
919
/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
920
///   instructions.
921
///
922
/// \param __a
923
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
924
///    32 bits of this operand are used in the comparison.
925
/// \param __b
926
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
927
///    32 bits of this operand are used in the comparison.
928
/// \returns A 128-bit vector of [4 x float] containing the comparison results
929
///    in the low-order bits.
930
static __inline__ __m128 __DEFAULT_FN_ATTRS
931
_mm_cmpord_ss(__m128 __a, __m128 __b)
932
{
933
  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
934
}
935
 
936
/// Compares each of the corresponding 32-bit float values of the
937
///    128-bit vectors of [4 x float] to determine if the values in the first
938
///    operand are ordered with respect to those in the second operand.
939
///
940
/// \headerfile <x86intrin.h>
941
///
942
/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
943
///   instructions.
944
///
945
/// \param __a
946
///    A 128-bit vector of [4 x float].
947
/// \param __b
948
///    A 128-bit vector of [4 x float].
949
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
950
static __inline__ __m128 __DEFAULT_FN_ATTRS
951
_mm_cmpord_ps(__m128 __a, __m128 __b)
952
{
953
  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
954
}
955
 
956
/// Compares two 32-bit float values in the low-order bits of both
957
///    operands to determine if the value in the first operand is unordered
958
///    with respect to the corresponding value in the second operand and
959
///    returns the result of the comparison in the low-order bits of a vector
960
///    of [4 x float].
961
///
962
/// \headerfile <x86intrin.h>
963
///
964
/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
965
///   instructions.
966
///
967
/// \param __a
968
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
969
///    32 bits of this operand are used in the comparison.
970
/// \param __b
971
///    A 128-bit vector of [4 x float] containing one of the operands. The lower
972
///    32 bits of this operand are used in the comparison.
973
/// \returns A 128-bit vector of [4 x float] containing the comparison results
974
///    in the low-order bits.
975
static __inline__ __m128 __DEFAULT_FN_ATTRS
976
_mm_cmpunord_ss(__m128 __a, __m128 __b)
977
{
978
  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
979
}
980
 
981
/// Compares each of the corresponding 32-bit float values of the
982
///    128-bit vectors of [4 x float] to determine if the values in the first
983
///    operand are unordered with respect to those in the second operand.
984
///
985
/// \headerfile <x86intrin.h>
986
///
987
/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
988
///   instructions.
989
///
990
/// \param __a
991
///    A 128-bit vector of [4 x float].
992
/// \param __b
993
///    A 128-bit vector of [4 x float].
994
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
995
static __inline__ __m128 __DEFAULT_FN_ATTRS
996
_mm_cmpunord_ps(__m128 __a, __m128 __b)
997
{
998
  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
999
}
1000
 
1001
/// Compares two 32-bit float values in the low-order bits of both
1002
///    operands for equality and returns the result of the comparison.
1003
///
1004
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1005
///
1006
/// \headerfile <x86intrin.h>
1007
///
1008
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1009
///   instructions.
1010
///
1011
/// \param __a
1012
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1013
///    used in the comparison.
1014
/// \param __b
1015
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1016
///    used in the comparison.
1017
/// \returns An integer containing the comparison results. If either of the
1018
///    two lower 32-bit values is NaN, 0 is returned.
1019
static __inline__ int __DEFAULT_FN_ATTRS
1020
_mm_comieq_ss(__m128 __a, __m128 __b)
1021
{
1022
  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1023
}
1024
 
1025
/// Compares two 32-bit float values in the low-order bits of both
1026
///    operands to determine if the first operand is less than the second
1027
///    operand and returns the result of the comparison.
1028
///
1029
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1030
///
1031
/// \headerfile <x86intrin.h>
1032
///
1033
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1034
///   instructions.
1035
///
1036
/// \param __a
1037
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1038
///    used in the comparison.
1039
/// \param __b
1040
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1041
///    used in the comparison.
1042
/// \returns An integer containing the comparison results. If either of the two
1043
///     lower 32-bit values is NaN, 0 is returned.
1044
static __inline__ int __DEFAULT_FN_ATTRS
1045
_mm_comilt_ss(__m128 __a, __m128 __b)
1046
{
1047
  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1048
}
1049
 
1050
/// Compares two 32-bit float values in the low-order bits of both
1051
///    operands to determine if the first operand is less than or equal to the
1052
///    second operand and returns the result of the comparison.
1053
///
1054
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1055
///
1056
/// \headerfile <x86intrin.h>
1057
///
1058
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1059
///
1060
/// \param __a
1061
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1062
///    used in the comparison.
1063
/// \param __b
1064
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1065
///    used in the comparison.
1066
/// \returns An integer containing the comparison results. If either of the two
1067
///     lower 32-bit values is NaN, 0 is returned.
1068
static __inline__ int __DEFAULT_FN_ATTRS
1069
_mm_comile_ss(__m128 __a, __m128 __b)
1070
{
1071
  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1072
}
1073
 
1074
/// Compares two 32-bit float values in the low-order bits of both
1075
///    operands to determine if the first operand is greater than the second
1076
///    operand and returns the result of the comparison.
1077
///
1078
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1079
///
1080
/// \headerfile <x86intrin.h>
1081
///
1082
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1083
///
1084
/// \param __a
1085
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1086
///    used in the comparison.
1087
/// \param __b
1088
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1089
///    used in the comparison.
1090
/// \returns An integer containing the comparison results. If either of the
1091
///     two lower 32-bit values is NaN, 0 is returned.
1092
static __inline__ int __DEFAULT_FN_ATTRS
1093
_mm_comigt_ss(__m128 __a, __m128 __b)
1094
{
1095
  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1096
}
1097
 
1098
/// Compares two 32-bit float values in the low-order bits of both
1099
///    operands to determine if the first operand is greater than or equal to
1100
///    the second operand and returns the result of the comparison.
1101
///
1102
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1103
///
1104
/// \headerfile <x86intrin.h>
1105
///
1106
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1107
///
1108
/// \param __a
1109
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1110
///    used in the comparison.
1111
/// \param __b
1112
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1113
///    used in the comparison.
1114
/// \returns An integer containing the comparison results. If either of the two
1115
///    lower 32-bit values is NaN, 0 is returned.
1116
static __inline__ int __DEFAULT_FN_ATTRS
1117
_mm_comige_ss(__m128 __a, __m128 __b)
1118
{
1119
  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1120
}
1121
 
1122
/// Compares two 32-bit float values in the low-order bits of both
1123
///    operands to determine if the first operand is not equal to the second
1124
///    operand and returns the result of the comparison.
1125
///
1126
///    If either of the two lower 32-bit values is NaN, 1 is returned.
1127
///
1128
/// \headerfile <x86intrin.h>
1129
///
1130
/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1131
///
1132
/// \param __a
1133
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1134
///    used in the comparison.
1135
/// \param __b
1136
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1137
///    used in the comparison.
1138
/// \returns An integer containing the comparison results. If either of the
1139
///     two lower 32-bit values is NaN, 1 is returned.
1140
static __inline__ int __DEFAULT_FN_ATTRS
1141
_mm_comineq_ss(__m128 __a, __m128 __b)
1142
{
1143
  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1144
}
1145
 
1146
/// Performs an unordered comparison of two 32-bit float values using
1147
///    the low-order bits of both operands to determine equality and returns
1148
///    the result of the comparison.
1149
///
1150
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1151
///
1152
/// \headerfile <x86intrin.h>
1153
///
1154
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1155
///
1156
/// \param __a
1157
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1158
///    used in the comparison.
1159
/// \param __b
1160
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1161
///    used in the comparison.
1162
/// \returns An integer containing the comparison results. If either of the two
1163
///     lower 32-bit values is NaN, 0 is returned.
1164
static __inline__ int __DEFAULT_FN_ATTRS
1165
_mm_ucomieq_ss(__m128 __a, __m128 __b)
1166
{
1167
  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1168
}
1169
 
1170
/// Performs an unordered comparison of two 32-bit float values using
1171
///    the low-order bits of both operands to determine if the first operand is
1172
///    less than the second operand and returns the result of the comparison.
1173
///
1174
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1175
///
1176
/// \headerfile <x86intrin.h>
1177
///
1178
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1179
///
1180
/// \param __a
1181
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1182
///    used in the comparison.
1183
/// \param __b
1184
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1185
///    used in the comparison.
1186
/// \returns An integer containing the comparison results. If either of the two
1187
///    lower 32-bit values is NaN, 0 is returned.
1188
static __inline__ int __DEFAULT_FN_ATTRS
1189
_mm_ucomilt_ss(__m128 __a, __m128 __b)
1190
{
1191
  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1192
}
1193
 
1194
/// Performs an unordered comparison of two 32-bit float values using
1195
///    the low-order bits of both operands to determine if the first operand is
1196
///    less than or equal to the second operand and returns the result of the
1197
///    comparison.
1198
///
1199
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1200
///
1201
/// \headerfile <x86intrin.h>
1202
///
1203
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1204
///
1205
/// \param __a
1206
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1207
///    used in the comparison.
1208
/// \param __b
1209
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1210
///    used in the comparison.
1211
/// \returns An integer containing the comparison results. If either of the two
1212
///     lower 32-bit values is NaN, 0 is returned.
1213
static __inline__ int __DEFAULT_FN_ATTRS
1214
_mm_ucomile_ss(__m128 __a, __m128 __b)
1215
{
1216
  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1217
}
1218
 
1219
/// Performs an unordered comparison of two 32-bit float values using
1220
///    the low-order bits of both operands to determine if the first operand is
1221
///    greater than the second operand and returns the result of the
1222
///    comparison.
1223
///
1224
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1225
///
1226
/// \headerfile <x86intrin.h>
1227
///
1228
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1229
///
1230
/// \param __a
1231
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1232
///    used in the comparison.
1233
/// \param __b
1234
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1235
///    used in the comparison.
1236
/// \returns An integer containing the comparison results. If either of the two
1237
///     lower 32-bit values is NaN, 0 is returned.
1238
static __inline__ int __DEFAULT_FN_ATTRS
1239
_mm_ucomigt_ss(__m128 __a, __m128 __b)
1240
{
1241
  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1242
}
1243
 
1244
/// Performs an unordered comparison of two 32-bit float values using
1245
///    the low-order bits of both operands to determine if the first operand is
1246
///    greater than or equal to the second operand and returns the result of
1247
///    the comparison.
1248
///
1249
///    If either of the two lower 32-bit values is NaN, 0 is returned.
1250
///
1251
/// \headerfile <x86intrin.h>
1252
///
1253
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1254
///
1255
/// \param __a
1256
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1257
///    used in the comparison.
1258
/// \param __b
1259
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1260
///    used in the comparison.
1261
/// \returns An integer containing the comparison results. If either of the two
1262
///     lower 32-bit values is NaN, 0 is returned.
1263
static __inline__ int __DEFAULT_FN_ATTRS
1264
_mm_ucomige_ss(__m128 __a, __m128 __b)
1265
{
1266
  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1267
}
1268
 
1269
/// Performs an unordered comparison of two 32-bit float values using
1270
///    the low-order bits of both operands to determine inequality and returns
1271
///    the result of the comparison.
1272
///
1273
///    If either of the two lower 32-bit values is NaN, 1 is returned.
1274
///
1275
/// \headerfile <x86intrin.h>
1276
///
1277
/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1278
///
1279
/// \param __a
1280
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1281
///    used in the comparison.
1282
/// \param __b
1283
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1284
///    used in the comparison.
1285
/// \returns An integer containing the comparison results. If either of the two
1286
///    lower 32-bit values is NaN, 1 is returned.
1287
static __inline__ int __DEFAULT_FN_ATTRS
1288
_mm_ucomineq_ss(__m128 __a, __m128 __b)
1289
{
1290
  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1291
}
1292
 
1293
/// Converts a float value contained in the lower 32 bits of a vector of
1294
///    [4 x float] into a 32-bit integer.
1295
///
1296
/// \headerfile <x86intrin.h>
1297
///
1298
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1299
///   instructions.
1300
///
1301
/// \param __a
1302
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1303
///    used in the conversion.
1304
/// \returns A 32-bit integer containing the converted value.
1305
static __inline__ int __DEFAULT_FN_ATTRS
1306
_mm_cvtss_si32(__m128 __a)
1307
{
1308
  return __builtin_ia32_cvtss2si((__v4sf)__a);
1309
}
1310
 
1311
/// Converts a float value contained in the lower 32 bits of a vector of
1312
///    [4 x float] into a 32-bit integer.
1313
///
1314
/// \headerfile <x86intrin.h>
1315
///
1316
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1317
///   instructions.
1318
///
1319
/// \param __a
1320
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1321
///    used in the conversion.
1322
/// \returns A 32-bit integer containing the converted value.
1323
static __inline__ int __DEFAULT_FN_ATTRS
1324
_mm_cvt_ss2si(__m128 __a)
1325
{
1326
  return _mm_cvtss_si32(__a);
1327
}
1328
 
1329
#ifdef __x86_64__
1330
 
1331
/// Converts a float value contained in the lower 32 bits of a vector of
1332
///    [4 x float] into a 64-bit integer.
1333
///
1334
/// \headerfile <x86intrin.h>
1335
///
1336
/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1337
///   instructions.
1338
///
1339
/// \param __a
1340
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1341
///    used in the conversion.
1342
/// \returns A 64-bit integer containing the converted value.
1343
static __inline__ long long __DEFAULT_FN_ATTRS
1344
_mm_cvtss_si64(__m128 __a)
1345
{
1346
  return __builtin_ia32_cvtss2si64((__v4sf)__a);
1347
}
1348
 
1349
#endif
1350
 
1351
/// Converts two low-order float values in a 128-bit vector of
1352
///    [4 x float] into a 64-bit vector of [2 x i32].
1353
///
1354
/// \headerfile <x86intrin.h>
1355
///
1356
/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1357
///
1358
/// \param __a
1359
///    A 128-bit vector of [4 x float].
1360
/// \returns A 64-bit integer vector containing the converted values.
1361
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1362
_mm_cvtps_pi32(__m128 __a)
1363
{
1364
  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1365
}
1366
 
1367
/// Converts two low-order float values in a 128-bit vector of
1368
///    [4 x float] into a 64-bit vector of [2 x i32].
1369
///
1370
/// \headerfile <x86intrin.h>
1371
///
1372
/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1373
///
1374
/// \param __a
1375
///    A 128-bit vector of [4 x float].
1376
/// \returns A 64-bit integer vector containing the converted values.
1377
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1378
_mm_cvt_ps2pi(__m128 __a)
1379
{
1380
  return _mm_cvtps_pi32(__a);
1381
}
1382
 
1383
/// Converts a float value contained in the lower 32 bits of a vector of
1384
///    [4 x float] into a 32-bit integer, truncating the result when it is
1385
///    inexact.
1386
///
1387
/// \headerfile <x86intrin.h>
1388
///
1389
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1390
///   instructions.
1391
///
1392
/// \param __a
1393
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1394
///    used in the conversion.
1395
/// \returns A 32-bit integer containing the converted value.
1396
static __inline__ int __DEFAULT_FN_ATTRS
1397
_mm_cvttss_si32(__m128 __a)
1398
{
1399
  return __builtin_ia32_cvttss2si((__v4sf)__a);
1400
}
1401
 
1402
/// Converts a float value contained in the lower 32 bits of a vector of
1403
///    [4 x float] into a 32-bit integer, truncating the result when it is
1404
///    inexact.
1405
///
1406
/// \headerfile <x86intrin.h>
1407
///
1408
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1409
///   instructions.
1410
///
1411
/// \param __a
1412
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1413
///    used in the conversion.
1414
/// \returns A 32-bit integer containing the converted value.
1415
static __inline__ int __DEFAULT_FN_ATTRS
1416
_mm_cvtt_ss2si(__m128 __a)
1417
{
1418
  return _mm_cvttss_si32(__a);
1419
}
1420
 
1421
#ifdef __x86_64__
1422
/// Converts a float value contained in the lower 32 bits of a vector of
1423
///    [4 x float] into a 64-bit integer, truncating the result when it is
1424
///    inexact.
1425
///
1426
/// \headerfile <x86intrin.h>
1427
///
1428
/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1429
///   instructions.
1430
///
1431
/// \param __a
1432
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1433
///    used in the conversion.
1434
/// \returns A 64-bit integer containing the converted value.
1435
static __inline__ long long __DEFAULT_FN_ATTRS
1436
_mm_cvttss_si64(__m128 __a)
1437
{
1438
  return __builtin_ia32_cvttss2si64((__v4sf)__a);
1439
}
1440
#endif
1441
 
1442
/// Converts two low-order float values in a 128-bit vector of
1443
///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1444
///    when it is inexact.
1445
///
1446
/// \headerfile <x86intrin.h>
1447
///
1448
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1450
///
1451
/// \param __a
1452
///    A 128-bit vector of [4 x float].
1453
/// \returns A 64-bit integer vector containing the converted values.
1454
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1455
_mm_cvttps_pi32(__m128 __a)
1456
{
1457
  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1458
}
1459
 
1460
/// Converts two low-order float values in a 128-bit vector of [4 x
1461
///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1462
///    is inexact.
1463
///
1464
/// \headerfile <x86intrin.h>
1465
///
1466
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1467
///
1468
/// \param __a
1469
///    A 128-bit vector of [4 x float].
1470
/// \returns A 64-bit integer vector containing the converted values.
1471
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1472
_mm_cvtt_ps2pi(__m128 __a)
1473
{
1474
  return _mm_cvttps_pi32(__a);
1475
}
1476
 
1477
/// Converts a 32-bit signed integer value into a floating point value
1478
///    and writes it to the lower 32 bits of the destination. The remaining
1479
///    higher order elements of the destination vector are copied from the
1480
///    corresponding elements in the first operand.
1481
///
1482
/// \headerfile <x86intrin.h>
1483
///
1484
/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
1485
///
1486
/// \param __a
1487
///    A 128-bit vector of [4 x float].
1488
/// \param __b
1489
///    A 32-bit signed integer operand containing the value to be converted.
1490
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1491
///    converted value of the second operand. The upper 96 bits are copied from
1492
///    the upper 96 bits of the first operand.
1493
static __inline__ __m128 __DEFAULT_FN_ATTRS
1494
_mm_cvtsi32_ss(__m128 __a, int __b)
1495
{
1496
  __a[0] = __b;
1497
  return __a;
1498
}
1499
 
1500
/// Converts a 32-bit signed integer value into a floating point value
1501
///    and writes it to the lower 32 bits of the destination. The remaining
1502
///    higher order elements of the destination are copied from the
1503
///    corresponding elements in the first operand.
1504
///
1505
/// \headerfile <x86intrin.h>
1506
///
1507
/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
1508
///
1509
/// \param __a
1510
///    A 128-bit vector of [4 x float].
1511
/// \param __b
1512
///    A 32-bit signed integer operand containing the value to be converted.
1513
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1514
///    converted value of the second operand. The upper 96 bits are copied from
1515
///    the upper 96 bits of the first operand.
1516
static __inline__ __m128 __DEFAULT_FN_ATTRS
1517
_mm_cvt_si2ss(__m128 __a, int __b)
1518
{
1519
  return _mm_cvtsi32_ss(__a, __b);
1520
}
1521
 
1522
#ifdef __x86_64__
1523
 
1524
/// Converts a 64-bit signed integer value into a floating point value
1525
///    and writes it to the lower 32 bits of the destination. The remaining
1526
///    higher order elements of the destination are copied from the
1527
///    corresponding elements in the first operand.
1528
///
1529
/// \headerfile <x86intrin.h>
1530
///
1531
/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
1532
///
1533
/// \param __a
1534
///    A 128-bit vector of [4 x float].
1535
/// \param __b
1536
///    A 64-bit signed integer operand containing the value to be converted.
1537
/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1538
///    converted value of the second operand. The upper 96 bits are copied from
1539
///    the upper 96 bits of the first operand.
1540
static __inline__ __m128 __DEFAULT_FN_ATTRS
1541
_mm_cvtsi64_ss(__m128 __a, long long __b)
1542
{
1543
  __a[0] = __b;
1544
  return __a;
1545
}
1546
 
1547
#endif
1548
 
1549
/// Converts two elements of a 64-bit vector of [2 x i32] into two
1550
///    floating point values and writes them to the lower 64-bits of the
1551
///    destination. The remaining higher order elements of the destination are
1552
///    copied from the corresponding elements in the first operand.
1553
///
1554
/// \headerfile <x86intrin.h>
1555
///
1556
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1557
///
1558
/// \param __a
1559
///    A 128-bit vector of [4 x float].
1560
/// \param __b
1561
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1562
///    and written to the corresponding low-order elements in the destination.
1563
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1564
///    converted value of the second operand. The upper 64 bits are copied from
1565
///    the upper 64 bits of the first operand.
1566
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1567
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
1568
{
1569
  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1570
}
1571
 
1572
/// Converts two elements of a 64-bit vector of [2 x i32] into two
1573
///    floating point values and writes them to the lower 64-bits of the
1574
///    destination. The remaining higher order elements of the destination are
1575
///    copied from the corresponding elements in the first operand.
1576
///
1577
/// \headerfile <x86intrin.h>
1578
///
1579
/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1580
///
1581
/// \param __a
1582
///    A 128-bit vector of [4 x float].
1583
/// \param __b
1584
///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1585
///    and written to the corresponding low-order elements in the destination.
1586
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1587
///    converted value from the second operand. The upper 64 bits are copied
1588
///    from the upper 64 bits of the first operand.
1589
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1590
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
1591
{
1592
  return _mm_cvtpi32_ps(__a, __b);
1593
}
1594
 
1595
/// Extracts a float value contained in the lower 32 bits of a vector of
1596
///    [4 x float].
1597
///
1598
/// \headerfile <x86intrin.h>
1599
///
1600
/// This intrinsic has no corresponding instruction.
1601
///
1602
/// \param __a
1603
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1604
///    used in the extraction.
1605
/// \returns A 32-bit float containing the extracted value.
1606
static __inline__ float __DEFAULT_FN_ATTRS
1607
_mm_cvtss_f32(__m128 __a)
1608
{
1609
  return __a[0];
1610
}
1611
 
1612
/// Loads two packed float values from the address \a __p into the
1613
///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1614
///     are copied from the low-order bits of the first operand.
1615
///
1616
/// \headerfile <x86intrin.h>
1617
///
1618
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
1619
///
1620
/// \param __a
1621
///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1622
///    of the destination.
1623
/// \param __p
1624
///    A pointer to two packed float values. Bits [63:0] are written to bits
1625
///    [127:64] of the destination.
1626
/// \returns A 128-bit vector of [4 x float] containing the moved values.
1627
static __inline__ __m128 __DEFAULT_FN_ATTRS
1628
_mm_loadh_pi(__m128 __a, const __m64 *__p)
1629
{
1630
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1631
  struct __mm_loadh_pi_struct {
1632
    __mm_loadh_pi_v2f32 __u;
1633
  } __attribute__((__packed__, __may_alias__));
1634
  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1635
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1636
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1637
}
1638
 
1639
/// Loads two packed float values from the address \a __p into the
1640
///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1641
///    are copied from the high-order bits of the first operand.
1642
///
1643
/// \headerfile <x86intrin.h>
1644
///
1645
/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1646
///
1647
/// \param __a
1648
///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1649
///    [127:64] of the destination.
1650
/// \param __p
1651
///    A pointer to two packed float values. Bits [63:0] are written to bits
1652
///    [63:0] of the destination.
1653
/// \returns A 128-bit vector of [4 x float] containing the moved values.
1654
static __inline__ __m128 __DEFAULT_FN_ATTRS
1655
_mm_loadl_pi(__m128 __a, const __m64 *__p)
1656
{
1657
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1658
  struct __mm_loadl_pi_struct {
1659
    __mm_loadl_pi_v2f32 __u;
1660
  } __attribute__((__packed__, __may_alias__));
1661
  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1662
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1663
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1664
}
1665
 
1666
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1667
///    32 bits of the vector are initialized with the single-precision
1668
///    floating-point value loaded from a specified memory location. The upper
1669
///    96 bits are set to zero.
1670
///
1671
/// \headerfile <x86intrin.h>
1672
///
1673
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1674
///
1675
/// \param __p
1676
///    A pointer to a 32-bit memory location containing a single-precision
1677
///    floating-point value.
1678
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1679
///    lower 32 bits contain the value loaded from the memory location. The
1680
///    upper 96 bits are set to zero.
1681
static __inline__ __m128 __DEFAULT_FN_ATTRS
1682
_mm_load_ss(const float *__p)
1683
{
1684
  struct __mm_load_ss_struct {
1685
    float __u;
1686
  } __attribute__((__packed__, __may_alias__));
1687
  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1688
  return __extension__ (__m128){ __u, 0, 0, 0 };
1689
}
1690
 
1691
/// Loads a 32-bit float value and duplicates it to all four vector
1692
///    elements of a 128-bit vector of [4 x float].
1693
///
1694
/// \headerfile <x86intrin.h>
1695
///
1696
/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1697
///    instruction.
1698
///
1699
/// \param __p
1700
///    A pointer to a float value to be loaded and duplicated.
1701
/// \returns A 128-bit vector of [4 x float] containing the loaded and
1702
///    duplicated values.
1703
static __inline__ __m128 __DEFAULT_FN_ATTRS
1704
_mm_load1_ps(const float *__p)
1705
{
1706
  struct __mm_load1_ps_struct {
1707
    float __u;
1708
  } __attribute__((__packed__, __may_alias__));
1709
  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1710
  return __extension__ (__m128){ __u, __u, __u, __u };
1711
}
1712
 
1713
/* Alternate spelling: expands to a call of _mm_load1_ps with the same
   argument. */
#define        _mm_load_ps1(p) _mm_load1_ps(p)
1714
 
1715
/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1716
///    memory location.
1717
///
1718
/// \headerfile <x86intrin.h>
1719
///
1720
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1721
///
1722
/// \param __p
1723
///    A pointer to a 128-bit memory location. The address of the memory
1724
///    location has to be 128-bit aligned.
1725
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1726
static __inline__ __m128 __DEFAULT_FN_ATTRS
1727
_mm_load_ps(const float *__p)
1728
{
1729
  return *(const __m128*)__p;
1730
}
1731
 
1732
/// Loads a 128-bit floating-point vector of [4 x float] from an
1733
///    unaligned memory location.
1734
///
1735
/// \headerfile <x86intrin.h>
1736
///
1737
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1738
///
1739
/// \param __p
1740
///    A pointer to a 128-bit memory location. The address of the memory
1741
///    location does not have to be aligned.
1742
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
1743
static __inline__ __m128 __DEFAULT_FN_ATTRS
1744
_mm_loadu_ps(const float *__p)
1745
{
1746
  struct __loadu_ps {
1747
    __m128_u __v;
1748
  } __attribute__((__packed__, __may_alias__));
1749
  return ((const struct __loadu_ps*)__p)->__v;
1750
}
1751
 
1752
/// Loads four packed float values, in reverse order, from an aligned
1753
///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1754
///
1755
/// \headerfile <x86intrin.h>
1756
///
1757
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1758
///    instruction.
1759
///
1760
/// \param __p
1761
///    A pointer to a 128-bit memory location. The address of the memory
1762
///    location has to be 128-bit aligned.
1763
/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1764
///    in reverse order.
1765
static __inline__ __m128 __DEFAULT_FN_ATTRS
1766
_mm_loadr_ps(const float *__p)
1767
{
1768
  __m128 __a = _mm_load_ps(__p);
1769
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1770
}
1771
 
1772
/// Create a 128-bit vector of [4 x float] with undefined values.
1773
///
1774
/// \headerfile <x86intrin.h>
1775
///
1776
/// This intrinsic has no corresponding instruction.
1777
///
1778
/// \returns A 128-bit vector of [4 x float] containing undefined values.
1779
static __inline__ __m128 __DEFAULT_FN_ATTRS
1780
_mm_undefined_ps(void)
1781
{
1782
  return (__m128)__builtin_ia32_undef128();
1783
}
1784
 
1785
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1786
///    32 bits of the vector are initialized with the specified single-precision
1787
///    floating-point value. The upper 96 bits are set to zero.
1788
///
1789
/// \headerfile <x86intrin.h>
1790
///
1791
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1792
///
1793
/// \param __w
1794
///    A single-precision floating-point value used to initialize the lower 32
1795
///    bits of the result.
1796
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1797
///    lower 32 bits contain the value provided in the source operand. The
1798
///    upper 96 bits are set to zero.
1799
static __inline__ __m128 __DEFAULT_FN_ATTRS
1800
_mm_set_ss(float __w)
1801
{
1802
  return __extension__ (__m128){ __w, 0, 0, 0 };
1803
}
1804
 
1805
/// Constructs a 128-bit floating-point vector of [4 x float], with each
1806
///    of the four single-precision floating-point vector elements set to the
1807
///    specified single-precision floating-point value.
1808
///
1809
/// \headerfile <x86intrin.h>
1810
///
1811
/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1812
///
1813
/// \param __w
1814
///    A single-precision floating-point value used to initialize each vector
1815
///    element of the result.
1816
/// \returns An initialized 128-bit floating-point vector of [4 x float].
1817
static __inline__ __m128 __DEFAULT_FN_ATTRS
1818
_mm_set1_ps(float __w)
1819
{
1820
  return __extension__ (__m128){ __w, __w, __w, __w };
1821
}
1822
 
1823
/* Microsoft specific. */
1824
/// Constructs a 128-bit floating-point vector of [4 x float], with each
1825
///    of the four single-precision floating-point vector elements set to the
1826
///    specified single-precision floating-point value.
1827
///
1828
/// \headerfile <x86intrin.h>
1829
///
1830
/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1831
///
1832
/// \param __w
1833
///    A single-precision floating-point value used to initialize each vector
1834
///    element of the result.
1835
/// \returns An initialized 128-bit floating-point vector of [4 x float].
1836
static __inline__ __m128 __DEFAULT_FN_ATTRS
1837
_mm_set_ps1(float __w)
1838
{
1839
    return _mm_set1_ps(__w);
1840
}
1841
 
1842
/// Constructs a 128-bit floating-point vector of [4 x float]
1843
///    initialized with the specified single-precision floating-point values.
1844
///
1845
/// \headerfile <x86intrin.h>
1846
///
1847
/// This intrinsic is a utility function and does not correspond to a specific
1848
///    instruction.
1849
///
1850
/// \param __z
1851
///    A single-precision floating-point value used to initialize bits [127:96]
1852
///    of the result.
1853
/// \param __y
1854
///    A single-precision floating-point value used to initialize bits [95:64]
1855
///    of the result.
1856
/// \param __x
1857
///    A single-precision floating-point value used to initialize bits [63:32]
1858
///    of the result.
1859
/// \param __w
1860
///    A single-precision floating-point value used to initialize bits [31:0]
1861
///    of the result.
1862
/// \returns An initialized 128-bit floating-point vector of [4 x float].
1863
static __inline__ __m128 __DEFAULT_FN_ATTRS
1864
_mm_set_ps(float __z, float __y, float __x, float __w)
1865
{
1866
  return __extension__ (__m128){ __w, __x, __y, __z };
1867
}
1868
 
1869
/// Constructs a 128-bit floating-point vector of [4 x float],
1870
///    initialized in reverse order with the specified 32-bit single-precision
1871
///    floating-point values.
1872
///
1873
/// \headerfile <x86intrin.h>
1874
///
1875
/// This intrinsic is a utility function and does not correspond to a specific
1876
///    instruction.
1877
///
1878
/// \param __z
1879
///    A single-precision floating-point value used to initialize bits [31:0]
1880
///    of the result.
1881
/// \param __y
1882
///    A single-precision floating-point value used to initialize bits [63:32]
1883
///    of the result.
1884
/// \param __x
1885
///    A single-precision floating-point value used to initialize bits [95:64]
1886
///    of the result.
1887
/// \param __w
1888
///    A single-precision floating-point value used to initialize bits [127:96]
1889
///    of the result.
1890
/// \returns An initialized 128-bit floating-point vector of [4 x float].
1891
static __inline__ __m128 __DEFAULT_FN_ATTRS
1892
_mm_setr_ps(float __z, float __y, float __x, float __w)
1893
{
1894
  return __extension__ (__m128){ __z, __y, __x, __w };
1895
}
1896
 
1897
/// Constructs a 128-bit floating-point vector of [4 x float] initialized
1898
///    to zero.
1899
///
1900
/// \headerfile <x86intrin.h>
1901
///
1902
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1903
///
1904
/// \returns An initialized 128-bit floating-point vector of [4 x float] with
1905
///    all elements set to zero.
1906
static __inline__ __m128 __DEFAULT_FN_ATTRS
1907
_mm_setzero_ps(void)
1908
{
1909
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
1910
}
1911
 
1912
/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1913
///    memory location.
1914
///
1915
/// \headerfile <x86intrin.h>
1916
///
1917
/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
1918
///
1919
/// \param __p
1920
///    A pointer to a 64-bit memory location.
1921
/// \param __a
1922
///    A 128-bit vector of [4 x float] containing the values to be stored.
1923
static __inline__ void __DEFAULT_FN_ATTRS
1924
_mm_storeh_pi(__m64 *__p, __m128 __a)
1925
{
1926
  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1927
  struct __mm_storeh_pi_struct {
1928
    __mm_storeh_pi_v2f32 __u;
1929
  } __attribute__((__packed__, __may_alias__));
1930
  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1931
}
1932
 
1933
/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1934
///     memory location.
1935
///
1936
/// \headerfile <x86intrin.h>
1937
///
1938
/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1939
///
1940
/// \param __p
1941
///    A pointer to a memory location that will receive the float values.
1942
/// \param __a
1943
///    A 128-bit vector of [4 x float] containing the values to be stored.
1944
static __inline__ void __DEFAULT_FN_ATTRS
1945
_mm_storel_pi(__m64 *__p, __m128 __a)
1946
{
1947
  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1948
  struct __mm_storeh_pi_struct {
1949
    __mm_storeh_pi_v2f32 __u;
1950
  } __attribute__((__packed__, __may_alias__));
1951
  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1952
}
1953
 
1954
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1955
///     memory location.
1956
///
1957
/// \headerfile <x86intrin.h>
1958
///
1959
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1960
///
1961
/// \param __p
1962
///    A pointer to a 32-bit memory location.
1963
/// \param __a
1964
///    A 128-bit vector of [4 x float] containing the value to be stored.
1965
static __inline__ void __DEFAULT_FN_ATTRS
1966
_mm_store_ss(float *__p, __m128 __a)
1967
{
1968
  struct __mm_store_ss_struct {
1969
    float __u;
1970
  } __attribute__((__packed__, __may_alias__));
1971
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1972
}
1973
 
1974
/// Stores a 128-bit vector of [4 x float] to an unaligned memory
1975
///    location.
1976
///
1977
/// \headerfile <x86intrin.h>
1978
///
1979
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1980
///
1981
/// \param __p
1982
///    A pointer to a 128-bit memory location. The address of the memory
1983
///    location does not have to be aligned.
1984
/// \param __a
1985
///    A 128-bit vector of [4 x float] containing the values to be stored.
1986
static __inline__ void __DEFAULT_FN_ATTRS
1987
_mm_storeu_ps(float *__p, __m128 __a)
1988
{
1989
  struct __storeu_ps {
1990
    __m128_u __v;
1991
  } __attribute__((__packed__, __may_alias__));
1992
  ((struct __storeu_ps*)__p)->__v = __a;
1993
}
1994
 
1995
/// Stores a 128-bit vector of [4 x float] into an aligned memory
1996
///    location.
1997
///
1998
/// \headerfile <x86intrin.h>
1999
///
2000
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2001
///
2002
/// \param __p
2003
///    A pointer to a 128-bit memory location. The address of the memory
2004
///    location has to be 16-byte aligned.
2005
/// \param __a
2006
///    A 128-bit vector of [4 x float] containing the values to be stored.
2007
static __inline__ void __DEFAULT_FN_ATTRS
2008
_mm_store_ps(float *__p, __m128 __a)
2009
{
2010
  *(__m128*)__p = __a;
2011
}
2012
 
2013
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2014
///    four contiguous elements in an aligned memory location.
2015
///
2016
/// \headerfile <x86intrin.h>
2017
///
2018
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2019
///    instruction.
2020
///
2021
/// \param __p
2022
///    A pointer to a 128-bit memory location.
2023
/// \param __a
2024
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2025
///    of the four contiguous elements pointed by \a __p.
2026
static __inline__ void __DEFAULT_FN_ATTRS
2027
_mm_store1_ps(float *__p, __m128 __a)
2028
{
2029
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2030
  _mm_store_ps(__p, __a);
2031
}
2032
 
2033
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2034
///    four contiguous elements in an aligned memory location.
2035
///
2036
/// \headerfile <x86intrin.h>
2037
///
2038
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2039
///    instruction.
2040
///
2041
/// \param __p
2042
///    A pointer to a 128-bit memory location.
2043
/// \param __a
2044
///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2045
///    of the four contiguous elements pointed by \a __p.
2046
static __inline__ void __DEFAULT_FN_ATTRS
2047
_mm_store_ps1(float *__p, __m128 __a)
2048
{
2049
  _mm_store1_ps(__p, __a);
2050
}
2051
 
2052
/// Stores float values from a 128-bit vector of [4 x float] to an
2053
///    aligned memory location in reverse order.
2054
///
2055
/// \headerfile <x86intrin.h>
2056
///
2057
/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2058
///    instruction.
2059
///
2060
/// \param __p
2061
///    A pointer to a 128-bit memory location. The address of the memory
2062
///    location has to be 128-bit aligned.
2063
/// \param __a
2064
///    A 128-bit vector of [4 x float] containing the values to be stored.
2065
static __inline__ void __DEFAULT_FN_ATTRS
2066
_mm_storer_ps(float *__p, __m128 __a)
2067
{
2068
  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2069
  _mm_store_ps(__p, __a);
2070
}
2071
 
2072
/* Prefetch hint selectors (_MM_HINT_*). The _mm_prefetch macro in this header
   splits a selector into two fields: bit 2 becomes the second argument of
   __builtin_prefetch and bits [1:0] become its third argument. */
#define _MM_HINT_ET0 7
2073
#define _MM_HINT_ET1 6
2074
#define _MM_HINT_T0  3
2075
#define _MM_HINT_T1  2
2076
#define _MM_HINT_T2  1
2077
#define _MM_HINT_NTA 0
2078
 
2079
#ifndef _MSC_VER
2080
/* FIXME: We have to #define this because "sel" must be a constant integer, and
2081
   Sema doesn't do any form of constant propagation yet. */
2082
 
2083
/// Loads one cache line of data from the specified address to a location
2084
///    closer to the processor.
2085
///
2086
/// \headerfile <x86intrin.h>
2087
///
2088
/// \code
2089
/// void _mm_prefetch(const void *a, const int sel);
2090
/// \endcode
2091
///
2092
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
2093
///
2094
/// \param a
2095
///    A pointer to a memory location containing a cache line of data.
2096
/// \param sel
2097
///    A predefined integer constant specifying the type of prefetch
2098
///    operation: \n
2099
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
2100
///    PREFETCHNTA instruction will be generated. \n
2101
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
2102
///    be generated. \n
2103
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
2104
///    be generated. \n
2105
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
2106
///    be generated.
2107
/* Bit 2 of sel is passed as the second builtin argument, bits [1:0] as the
   third. */
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), (((sel) >> 2) & 1), ((sel) & 0x3)))
2109
#endif
2110
 
2111
/// Stores a 64-bit integer in the specified aligned memory location. To
2112
///    minimize caching, the data is flagged as non-temporal (unlikely to be
2113
///    used again soon).
2114
///
2115
/// \headerfile <x86intrin.h>
2116
///
2117
/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2118
///
2119
/// \param __p
2120
///    A pointer to an aligned memory location used to store the register value.
2121
/// \param __a
2122
///    A 64-bit integer containing the value to be stored.
2123
static __inline__ void __DEFAULT_FN_ATTRS_MMX
2124
_mm_stream_pi(__m64 *__p, __m64 __a)
2125
{
2126
  __builtin_ia32_movntq(__p, __a);
2127
}
2128
 
2129
/// Moves packed float values from a 128-bit vector of [4 x float] to a
2130
///    128-bit aligned memory location. To minimize caching, the data is flagged
2131
///    as non-temporal (unlikely to be used again soon).
2132
///
2133
/// \headerfile <x86intrin.h>
2134
///
2135
/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2136
///
2137
/// \param __p
2138
///    A pointer to a 128-bit aligned memory location that will receive the
2139
///    single-precision floating-point values.
2140
/// \param __a
2141
///    A 128-bit vector of [4 x float] containing the values to be moved.
2142
static __inline__ void __DEFAULT_FN_ATTRS
2143
_mm_stream_ps(float *__p, __m128 __a)
2144
{
2145
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2146
}
2147
 
2148
#ifdef __cplusplus
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and those that follow it:
///    the system completes all previous stores before executing subsequent
///    stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2166
 
2167
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
2168
///    returns it, as specified by the immediate integer operand.
2169
///
2170
/// \headerfile <x86intrin.h>
2171
///
2172
/// \code
2173
/// int _mm_extract_pi16(__m64 a, int n);
2174
/// \endcode
2175
///
2176
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
2177
///
2178
/// \param a
2179
///    A 64-bit vector of [4 x i16].
2180
/// \param n
2181
///    An immediate integer operand that determines which bits are extracted: \n
2182
///    0: Bits [15:0] are copied to the destination. \n
2183
///    1: Bits [31:16] are copied to the destination. \n
2184
///    2: Bits [47:32] are copied to the destination. \n
2185
///    3: Bits [63:48] are copied to the destination.
2186
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
2187
/* Each macro argument is parenthesized so that the cast applies to the whole
   argument expression, not just its first operand. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2189
 
2190
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
2191
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
2192
///    specified by the immediate operand \a n.
2193
///
2194
/// \headerfile <x86intrin.h>
2195
///
2196
/// \code
2197
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
2198
/// \endcode
2199
///
2200
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
2201
///
2202
/// \param a
2203
///    A 64-bit vector of [4 x i16].
2204
/// \param d
2205
///    An integer. The lower 16-bit value from this operand is written to the
2206
///    destination at the offset specified by operand \a n.
2207
/// \param n
2208
///    An immediate integer operand that determines which bits are to be used
2209
///    in the destination. \n
2210
///    0: Bits [15:0] are copied to the destination. \n
2211
///    1: Bits [31:16] are copied to the destination. \n
2212
///    2: Bits [47:32] are copied to the destination. \n
2213
///    3: Bits [63:48] are copied to the destination.  \n
2214
///    The remaining bits in the destination are copied from the corresponding
2215
///    bits in operand \a a.
2216
/// \returns A 64-bit integer vector containing the copied packed data from the
2217
///    operands.
2218
/* Each macro argument is parenthesized so that the cast applies to the whole
   argument expression, not just its first operand. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2220
 
2221
/// Compares each of the corresponding packed 16-bit integer values of
2222
///    the 64-bit integer vectors, and writes the greater value to the
2223
///    corresponding bits in the destination.
2224
///
2225
/// \headerfile <x86intrin.h>
2226
///
2227
/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2228
///
2229
/// \param __a
2230
///    A 64-bit integer vector containing one of the source operands.
2231
/// \param __b
2232
///    A 64-bit integer vector containing one of the source operands.
2233
/// \returns A 64-bit integer vector containing the comparison results.
2234
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2235
_mm_max_pi16(__m64 __a, __m64 __b)
2236
{
2237
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2238
}
2239
 
2240
/// Compares each of the corresponding packed 8-bit unsigned integer
2241
///    values of the 64-bit integer vectors, and writes the greater value to the
2242
///    corresponding bits in the destination.
2243
///
2244
/// \headerfile <x86intrin.h>
2245
///
2246
/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2247
///
2248
/// \param __a
2249
///    A 64-bit integer vector containing one of the source operands.
2250
/// \param __b
2251
///    A 64-bit integer vector containing one of the source operands.
2252
/// \returns A 64-bit integer vector containing the comparison results.
2253
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2254
_mm_max_pu8(__m64 __a, __m64 __b)
2255
{
2256
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2257
}
2258
 
2259
/// Compares each of the corresponding packed 16-bit integer values of
2260
///    the 64-bit integer vectors, and writes the lesser value to the
2261
///    corresponding bits in the destination.
2262
///
2263
/// \headerfile <x86intrin.h>
2264
///
2265
/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2266
///
2267
/// \param __a
2268
///    A 64-bit integer vector containing one of the source operands.
2269
/// \param __b
2270
///    A 64-bit integer vector containing one of the source operands.
2271
/// \returns A 64-bit integer vector containing the comparison results.
2272
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2273
_mm_min_pi16(__m64 __a, __m64 __b)
2274
{
2275
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2276
}
2277
 
2278
/// Compares each of the corresponding packed 8-bit unsigned integer
2279
///    values of the 64-bit integer vectors, and writes the lesser value to the
2280
///    corresponding bits in the destination.
2281
///
2282
/// \headerfile <x86intrin.h>
2283
///
2284
/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2285
///
2286
/// \param __a
2287
///    A 64-bit integer vector containing one of the source operands.
2288
/// \param __b
2289
///    A 64-bit integer vector containing one of the source operands.
2290
/// \returns A 64-bit integer vector containing the comparison results.
2291
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2292
_mm_min_pu8(__m64 __a, __m64 __b)
2293
{
2294
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2295
}
2296
 
2297
/// Takes the most significant bit from each 8-bit element in a 64-bit
2298
///    integer vector to create an 8-bit mask value. Zero-extends the value to
2299
///    32-bit integer and writes it to the destination.
2300
///
2301
/// \headerfile <x86intrin.h>
2302
///
2303
/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2304
///
2305
/// \param __a
2306
///    A 64-bit integer vector containing the values with bits to be extracted.
2307
/// \returns The most significant bit from each 8-bit element in \a __a,
2308
///    written to bits [7:0].
2309
static __inline__ int __DEFAULT_FN_ATTRS_MMX
2310
_mm_movemask_pi8(__m64 __a)
2311
{
2312
  return __builtin_ia32_pmovmskb((__v8qi)__a);
2313
}
2314
 
2315
/// Multiplies packed 16-bit unsigned integer values and writes the
2316
///    high-order 16 bits of each 32-bit product to the corresponding bits in
2317
///    the destination.
2318
///
2319
/// \headerfile <x86intrin.h>
2320
///
2321
/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2322
///
2323
/// \param __a
2324
///    A 64-bit integer vector containing one of the source operands.
2325
/// \param __b
2326
///    A 64-bit integer vector containing one of the source operands.
2327
/// \returns A 64-bit integer vector containing the products of both operands.
2328
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2329
_mm_mulhi_pu16(__m64 __a, __m64 __b)
2330
{
2331
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2332
}
2333
 
2334
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2335
///    destination, as specified by the immediate value operand.
2336
///
2337
/// \headerfile <x86intrin.h>
2338
///
2339
/// \code
2340
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2341
/// \endcode
2342
///
2343
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2344
///
2345
/// \param a
2346
///    A 64-bit integer vector containing the values to be shuffled.
2347
/// \param n
2348
///    An immediate value containing an 8-bit value specifying which elements to
2349
///    copy from \a a. The destinations within the 64-bit destination are
2350
///    assigned values as follows: \n
2351
///    Bits [1:0] are used to assign values to bits [15:0] in the
2352
///    destination. \n
2353
///    Bits [3:2] are used to assign values to bits [31:16] in the
2354
///    destination. \n
2355
///    Bits [5:4] are used to assign values to bits [47:32] in the
2356
///    destination. \n
2357
///    Bits [7:6] are used to assign values to bits [63:48] in the
2358
///    destination. \n
2359
///    Bit value assignments: \n
2360
///    00: assigned from bits [15:0] of \a a. \n
2361
///    01: assigned from bits [31:16] of \a a. \n
2362
///    10: assigned from bits [47:32] of \a a. \n
2363
///    11: assigned from bits [63:48] of \a a. \n
2364
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2365
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2366
///    <c>[b6, b4, b2, b0]</c>.
2367
/// \returns A 64-bit integer vector containing the shuffled values.
2368
/* The selector is cast to int, as _mm_shuffle_ps does with its mask, so any
 * integer constant expression is accepted as the immediate. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (int)(n)))
2370
 
2371
/// Conditionally copies the values from each 8-bit element in the first
2372
///    64-bit integer vector operand to the specified memory location, as
2373
///    specified by the most significant bit in the corresponding element in the
2374
///    second 64-bit integer vector operand.
2375
///
2376
///    To minimize caching, the data is flagged as non-temporal
2377
///    (unlikely to be used again soon).
2378
///
2379
/// \headerfile <x86intrin.h>
2380
///
2381
/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2382
///
2383
/// \param __d
2384
///    A 64-bit integer vector containing the values with elements to be copied.
2385
/// \param __n
2386
///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2387
///    element determines whether the corresponding element in operand \a __d
2388
///    is copied. If the most significant bit of a given element is 1, the
2389
///    corresponding element in operand \a __d is copied.
2390
/// \param __p
2391
///    A pointer to a 64-bit memory location that will receive the conditionally
2392
///    copied integer values. The address of the memory location does not have
2393
///    to be aligned.
2394
static __inline__ void __DEFAULT_FN_ATTRS_MMX
2395
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2396
{
2397
  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2398
}
2399
 
2400
/// Computes the rounded averages of the packed unsigned 8-bit integer
2401
///    values and writes the averages to the corresponding bits in the
2402
///    destination.
2403
///
2404
/// \headerfile <x86intrin.h>
2405
///
2406
/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2407
///
2408
/// \param __a
2409
///    A 64-bit integer vector containing one of the source operands.
2410
/// \param __b
2411
///    A 64-bit integer vector containing one of the source operands.
2412
/// \returns A 64-bit integer vector containing the averages of both operands.
2413
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2414
_mm_avg_pu8(__m64 __a, __m64 __b)
2415
{
2416
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2417
}
2418
 
2419
/// Computes the rounded averages of the packed unsigned 16-bit integer
2420
///    values and writes the averages to the corresponding bits in the
2421
///    destination.
2422
///
2423
/// \headerfile <x86intrin.h>
2424
///
2425
/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2426
///
2427
/// \param __a
2428
///    A 64-bit integer vector containing one of the source operands.
2429
/// \param __b
2430
///    A 64-bit integer vector containing one of the source operands.
2431
/// \returns A 64-bit integer vector containing the averages of both operands.
2432
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2433
_mm_avg_pu16(__m64 __a, __m64 __b)
2434
{
2435
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2436
}
2437
 
2438
/// Subtracts the corresponding 8-bit unsigned integer values of the two
2439
///    64-bit vector operands and computes the absolute value for each of the
2440
///    differences. The sum of the 8 absolute differences is written to the
2441
///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2442
///
2443
/// \headerfile <x86intrin.h>
2444
///
2445
/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2446
///
2447
/// \param __a
2448
///    A 64-bit integer vector containing one of the source operands.
2449
/// \param __b
2450
///    A 64-bit integer vector containing one of the source operands.
2451
/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2452
///    sets of absolute differences between both operands. The upper bits are
2453
///    cleared.
2454
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2455
_mm_sad_pu8(__m64 __a, __m64 __b)
2456
{
2457
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2458
}
2459
 
2460
#if defined(__cplusplus)
2461
extern "C" {
2462
#endif
2463
 
2464
/// Returns the contents of the MXCSR register as a 32-bit unsigned
2465
///    integer value.
2466
///
2467
///    There are several groups of macros associated with this
2468
///    intrinsic, including:
2469
///    <ul>
2470
///    <li>
2471
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2472
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2473
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2474
///      _MM_GET_EXCEPTION_STATE().
2475
///    </li>
2476
///    <li>
2477
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2478
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2479
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2480
///    </li>
2481
///    <li>
2482
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2483
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2484
///      _MM_GET_ROUNDING_MODE().
2485
///    </li>
2486
///    <li>
2487
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2488
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2489
///    </li>
2490
///    <li>
2491
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2492
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2493
///      _MM_GET_DENORMALS_ZERO_MODE().
2494
///    </li>
2495
///    </ul>
2496
///
2497
///    For example, the following expression checks if an overflow exception has
2498
///    occurred:
2499
///    \code
2500
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2501
///    \endcode
2502
///
2503
///    The following expression gets the current rounding mode:
2504
///    \code
2505
///      _MM_GET_ROUNDING_MODE()
2506
///    \endcode
2507
///
2508
/// \headerfile <x86intrin.h>
2509
///
2510
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2511
///
2512
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2513
///    register.
2514
unsigned int _mm_getcsr(void);
2515
 
2516
/// Sets the MXCSR register with the 32-bit unsigned integer value.
2517
///
2518
///    There are several groups of macros associated with this intrinsic,
2519
///    including:
2520
///    <ul>
2521
///    <li>
2522
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2523
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2524
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2525
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2526
///    </li>
2527
///    <li>
2528
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2529
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2530
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2531
///      of these macros.
2532
///    </li>
2533
///    <li>
2534
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2535
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2536
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2537
///    </li>
2538
///    <li>
2539
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2540
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2541
///      one of these macros.
2542
///    </li>
2543
///    <li>
2544
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2545
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2546
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2547
///    </li>
2548
///    </ul>
2549
///
2550
///    For example, the following expression causes subsequent floating-point
2551
///    operations to round up:
2552
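///    operations to round up (the rounding field is cleared first, because
///    OR-ing _MM_ROUND_UP onto _MM_ROUND_DOWN would select round-toward-zero):
///    \code
///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode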
2553
///
2554
///    The following example sets the DAZ and FTZ flags:
2555
///    \code
2556
///    void setFlags() {
2557
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2558
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2559
///    }
2560
///    \endcode
2561
///
2562
/// \headerfile <x86intrin.h>
2563
///
2564
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2565
///
2566
/// \param __i
2567
///    A 32-bit unsigned integer value to be written to the MXCSR register.
2568
void _mm_setcsr(unsigned int __i);
2569
 
2570
#if defined(__cplusplus)
2571
} // extern "C"
2572
#endif
2573
 
2574
/// Selects 4 float values from the 128-bit operands of [4 x float], as
2575
///    specified by the immediate value operand.
2576
///
2577
/// \headerfile <x86intrin.h>
2578
///
2579
/// \code
2580
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2581
/// \endcode
2582
///
2583
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2584
///
2585
/// \param a
2586
///    A 128-bit vector of [4 x float].
2587
/// \param b
2588
///    A 128-bit vector of [4 x float].
2589
/// \param mask
2590
///    An immediate value containing an 8-bit value specifying which elements to
2591
///    copy from \a a and \a b. \n
2592
///    Bits [3:0] specify the values copied from operand \a a. \n
2593
///    Bits [7:4] specify the values copied from operand \a b. \n
2594
///    The destinations within the 128-bit destination are assigned values as
2595
///    follows: \n
2596
///    Bits [1:0] are used to assign values to bits [31:0] in the
2597
///    destination. \n
2598
///    Bits [3:2] are used to assign values to bits [63:32] in the
2599
///    destination. \n
2600
///    Bits [5:4] are used to assign values to bits [95:64] in the
2601
///    destination. \n
2602
///    Bits [7:6] are used to assign values to bits [127:96] in the
2603
///    destination. \n
2604
///    Bit value assignments: \n
2605
///    00: Bits [31:0] copied from the specified operand. \n
2606
///    01: Bits [63:32] copied from the specified operand. \n
2607
///    10: Bits [95:64] copied from the specified operand. \n
2608
///    11: Bits [127:96] copied from the specified operand. \n
2609
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2610
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2611
///    <c>[b6, b4, b2, b0]</c>.
2612
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
2613
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps( \
      (__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (int)(mask)))
2616
 
2617
/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2618
///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2619
///
2620
/// \headerfile <x86intrin.h>
2621
///
2622
/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2623
///
2624
/// \param __a
2625
///    A 128-bit vector of [4 x float]. \n
2626
///    Bits [95:64] are written to bits [31:0] of the destination. \n
2627
///    Bits [127:96] are written to bits [95:64] of the destination.
2628
/// \param __b
2629
///    A 128-bit vector of [4 x float]. \n
2630
///    Bits [95:64] are written to bits [63:32] of the destination. \n
2631
///    Bits [127:96] are written to bits [127:96] of the destination.
2632
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2633
static __inline__ __m128 __DEFAULT_FN_ATTRS
2634
_mm_unpackhi_ps(__m128 __a, __m128 __b)
2635
{
2636
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2637
}
2638
 
2639
/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2640
///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2641
///
2642
/// \headerfile <x86intrin.h>
2643
///
2644
/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2645
///
2646
/// \param __a
2647
///    A 128-bit vector of [4 x float]. \n
2648
///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2649
///    Bits [63:32] are written to bits [95:64] of the destination.
2650
/// \param __b
2651
///    A 128-bit vector of [4 x float]. \n
2652
///    Bits [31:0] are written to bits [63:32] of the destination. \n
2653
///    Bits [63:32] are written to bits [127:96] of the destination.
2654
/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2655
static __inline__ __m128 __DEFAULT_FN_ATTRS
2656
_mm_unpacklo_ps(__m128 __a, __m128 __b)
2657
{
2658
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2659
}
2660
 
2661
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2662
///    32 bits are set to the lower 32 bits of the second parameter. The upper
2663
///    96 bits are set to the upper 96 bits of the first parameter.
2664
///
2665
/// \headerfile <x86intrin.h>
2666
///
2667
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2668
///    instruction.
2669
///
2670
/// \param __a
2671
///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2672
///    written to the upper 96 bits of the result.
2673
/// \param __b
2674
///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2675
///    written to the lower 32 bits of the result.
2676
/// \returns A 128-bit floating-point vector of [4 x float].
2677
static __inline__ __m128 __DEFAULT_FN_ATTRS
2678
_mm_move_ss(__m128 __a, __m128 __b)
2679
{
2680
  __a[0] = __b[0];
2681
  return __a;
2682
}
2683
 
2684
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2685
///    64 bits are set to the upper 64 bits of the second parameter. The upper
2686
///    64 bits are set to the upper 64 bits of the first parameter.
2687
///
2688
/// \headerfile <x86intrin.h>
2689
///
2690
/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2691
///
2692
/// \param __a
2693
///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2694
///    written to the upper 64 bits of the result.
2695
/// \param __b
2696
///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2697
///    written to the lower 64 bits of the result.
2698
/// \returns A 128-bit floating-point vector of [4 x float].
2699
static __inline__ __m128 __DEFAULT_FN_ATTRS
2700
_mm_movehl_ps(__m128 __a, __m128 __b)
2701
{
2702
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2703
}
2704
 
2705
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2706
///    64 bits are set to the lower 64 bits of the first parameter. The upper
2707
///    64 bits are set to the lower 64 bits of the second parameter.
2708
///
2709
/// \headerfile <x86intrin.h>
2710
///
2711
/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2712
///
2713
/// \param __a
2714
///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2715
///    written to the lower 64 bits of the result.
2716
/// \param __b
2717
///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2718
///    written to the upper 64 bits of the result.
2719
/// \returns A 128-bit floating-point vector of [4 x float].
2720
static __inline__ __m128 __DEFAULT_FN_ATTRS
2721
_mm_movelh_ps(__m128 __a, __m128 __b)
2722
{
2723
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2724
}
2725
 
2726
/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2727
///    float].
2728
///
2729
/// \headerfile <x86intrin.h>
2730
///
2731
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2732
///
2733
/// \param __a
2734
///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2735
///    from the corresponding elements in this operand.
2736
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2737
///    values from the operand.
2738
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2739
_mm_cvtpi16_ps(__m64 __a)
2740
{
2741
  __m64 __b, __c;
2742
  __m128 __r;
2743
 
2744
  __b = _mm_setzero_si64();
2745
  __b = _mm_cmpgt_pi16(__b, __a);
2746
  __c = _mm_unpackhi_pi16(__a, __b);
2747
  __r = _mm_setzero_ps();
2748
  __r = _mm_cvtpi32_ps(__r, __c);
2749
  __r = _mm_movelh_ps(__r, __r);
2750
  __c = _mm_unpacklo_pi16(__a, __b);
2751
  __r = _mm_cvtpi32_ps(__r, __c);
2752
 
2753
  return __r;
2754
}
2755
 
2756
/// Converts a 64-bit vector of 16-bit unsigned integer values into a
2757
///    128-bit vector of [4 x float].
2758
///
2759
/// \headerfile <x86intrin.h>
2760
///
2761
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2762
///
2763
/// \param __a
2764
///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2765
///    destination are copied from the corresponding elements in this operand.
2766
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2767
///    values from the operand.
2768
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2769
_mm_cvtpu16_ps(__m64 __a)
2770
{
2771
  __m64 __b, __c;
2772
  __m128 __r;
2773
 
2774
  __b = _mm_setzero_si64();
2775
  __c = _mm_unpackhi_pi16(__a, __b);
2776
  __r = _mm_setzero_ps();
2777
  __r = _mm_cvtpi32_ps(__r, __c);
2778
  __r = _mm_movelh_ps(__r, __r);
2779
  __c = _mm_unpacklo_pi16(__a, __b);
2780
  __r = _mm_cvtpi32_ps(__r, __c);
2781
 
2782
  return __r;
2783
}
2784
 
2785
/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2786
///    into a 128-bit vector of [4 x float].
2787
///
2788
/// \headerfile <x86intrin.h>
2789
///
2790
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2791
///
2792
/// \param __a
2793
///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2794
///    from the corresponding lower 4 elements in this operand.
2795
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2796
///    values from the operand.
2797
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2798
_mm_cvtpi8_ps(__m64 __a)
2799
{
2800
  __m64 __b;
2801
 
2802
  __b = _mm_setzero_si64();
2803
  __b = _mm_cmpgt_pi8(__b, __a);
2804
  __b = _mm_unpacklo_pi8(__a, __b);
2805
 
2806
  return _mm_cvtpi16_ps(__b);
2807
}
2808
 
2809
/// Converts the lower four unsigned 8-bit integer values from a 64-bit
2810
///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2811
///
2812
/// \headerfile <x86intrin.h>
2813
///
2814
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2815
///
2816
/// \param __a
2817
///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2818
///    destination are copied from the corresponding lower 4 elements in this
2819
///    operand.
2820
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
2821
///    values from the source operand.
2822
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2823
_mm_cvtpu8_ps(__m64 __a)
2824
{
2825
  __m64 __b;
2826
 
2827
  __b = _mm_setzero_si64();
2828
  __b = _mm_unpacklo_pi8(__a, __b);
2829
 
2830
  return _mm_cvtpi16_ps(__b);
2831
}
2832
 
2833
/// Converts the two 32-bit signed integer values from each 64-bit vector
2834
///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2835
///
2836
/// \headerfile <x86intrin.h>
2837
///
2838
/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2839
///
2840
/// \param __a
2841
///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2842
///    copied from the elements in this operand.
2843
/// \param __b
2844
///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2845
///    copied from the elements in this operand.
2846
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2847
///    copied and converted values from the first operand. The upper 64 bits
2848
///    contain the copied and converted values from the second operand.
2849
static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2850
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2851
{
2852
  __m128 __c;
2853
 
2854
  __c = _mm_setzero_ps();
2855
  __c = _mm_cvtpi32_ps(__c, __b);
2856
  __c = _mm_movelh_ps(__c, __c);
2857
 
2858
  return _mm_cvtpi32_ps(__c, __a);
2859
}
2860
 
2861
/// Converts each single-precision floating-point element of a 128-bit
2862
///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2863
///    packs the results into a 64-bit integer vector of [4 x i16].
2864
///
2865
///    If the floating-point element is NaN or infinity, or if the
2866
///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2867
///    it is converted to 0x8000. Otherwise if the floating-point element is
2868
///    greater than 0x7FFF, it is converted to 0x7FFF.
2869
///
2870
/// \headerfile <x86intrin.h>
2871
///
2872
/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2873
///
2874
/// \param __a
2875
///    A 128-bit floating-point vector of [4 x float].
2876
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
2877
///    values.
2878
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2879
_mm_cvtps_pi16(__m128 __a)
2880
{
2881
  __m64 __b, __c;
2882
 
2883
  __b = _mm_cvtps_pi32(__a);
2884
  __a = _mm_movehl_ps(__a, __a);
2885
  __c = _mm_cvtps_pi32(__a);
2886
 
2887
  return _mm_packs_pi32(__b, __c);
2888
}
2889
 
2890
/// Converts each single-precision floating-point element of a 128-bit
2891
///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2892
///    packs the results into the lower 32 bits of a 64-bit integer vector of
2893
///    [8 x i8]. The upper 32 bits of the vector are set to 0.
2894
///
2895
///    If the floating-point element is NaN or infinity, or if the
2896
///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2897
///    is converted to 0x80. Otherwise if the floating-point element is greater
2898
///    than 0x7F, it is converted to 0x7F.
2899
///
2900
/// \headerfile <x86intrin.h>
2901
///
2902
/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2903
///
2904
/// \param __a
2905
///    128-bit floating-point vector of [4 x float].
2906
/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2907
///    converted values and the upper 32 bits are set to zero.
2908
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2909
_mm_cvtps_pi8(__m128 __a)
2910
{
2911
  __m64 __b, __c;
2912
 
2913
  __b = _mm_cvtps_pi16(__a);
2914
  __c = _mm_setzero_si64();
2915
 
2916
  return _mm_packs_pi16(__b, __c);
2917
}
2918
 
2919
/// Extracts the sign bits from each single-precision floating-point
2920
///    element of a 128-bit floating-point vector of [4 x float] and returns the
2921
///    sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
2922
///    to zero.
2923
///
2924
/// \headerfile <x86intrin.h>
2925
///
2926
/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2927
///
2928
/// \param __a
2929
///    A 128-bit floating-point vector of [4 x float].
2930
/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2931
///    single-precision floating-point element of the parameter. Bits [31:4] are
2932
///    set to zero.
2933
static __inline__ int __DEFAULT_FN_ATTRS
2934
_mm_movemask_ps(__m128 __a)
2935
{
2936
  return __builtin_ia32_movmskps((__v4sf)__a);
2937
}
2938
 
2939
 
2940
/* Declaration prefix requesting 16-byte alignment. The reserved spelling
 * __aligned__ is used (as in the __m128 typedef) so that a user macro named
 * "aligned" cannot change the expansion. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
2941
 
2942
/* Packs four 2-bit selectors into one 8-bit immediate: w occupies bits [1:0],
 * x bits [3:2], y bits [5:4] and z bits [7:6]. */
#define _MM_SHUFFLE(z, y, x, w) ((w) | ((x) << 2) | ((y) << 4) | ((z) << 6))
2943
 
2944
/* MXCSR exception-state flags (bits 0..5). */
#define _MM_EXCEPT_INVALID    (1U << 0)
#define _MM_EXCEPT_DENORM     (1U << 1)
#define _MM_EXCEPT_DIV_ZERO   (1U << 2)
#define _MM_EXCEPT_OVERFLOW   (1U << 3)
#define _MM_EXCEPT_UNDERFLOW  (1U << 4)
#define _MM_EXCEPT_INEXACT    (1U << 5)
#define _MM_EXCEPT_MASK                                                        \
  (_MM_EXCEPT_INVALID | _MM_EXCEPT_DENORM | _MM_EXCEPT_DIV_ZERO |              \
   _MM_EXCEPT_OVERFLOW | _MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_INEXACT)

/* MXCSR exception-mask flags (bits 7..12). */
#define _MM_MASK_INVALID      (1U << 7)
#define _MM_MASK_DENORM       (1U << 8)
#define _MM_MASK_DIV_ZERO     (1U << 9)
#define _MM_MASK_OVERFLOW     (1U << 10)
#define _MM_MASK_UNDERFLOW    (1U << 11)
#define _MM_MASK_INEXACT      (1U << 12)
#define _MM_MASK_MASK                                                          \
  (_MM_MASK_INVALID | _MM_MASK_DENORM | _MM_MASK_DIV_ZERO |                    \
   _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_INEXACT)

/* MXCSR rounding-mode field (bits 13..14). */
#define _MM_ROUND_NEAREST     (0U)
#define _MM_ROUND_DOWN        (1U << 13)
#define _MM_ROUND_UP          (1U << 14)
#define _MM_ROUND_TOWARD_ZERO (_MM_ROUND_DOWN | _MM_ROUND_UP)
#define _MM_ROUND_MASK        (_MM_ROUND_DOWN | _MM_ROUND_UP)

/* MXCSR flush-to-zero flag (bit 15). */
#define _MM_FLUSH_ZERO_MASK   (1U << 15)
#define _MM_FLUSH_ZERO_ON     (1U << 15)
#define _MM_FLUSH_ZERO_OFF    (0U)
2969
 
2970
/* Read one field of MXCSR by masking the value returned by _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK()  (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE()   (_mm_getcsr() & _MM_ROUND_MASK)

/* Replace one field of MXCSR: clear the field's bits in the current value,
 * OR in (x), and write the result back with _mm_setcsr(). */
#define _MM_SET_EXCEPTION_MASK(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2979
 
2980
/* Transposes, in place, the 4x4 float matrix whose rows are the four __m128
 * lvalues row0..row3. Each argument is expanded more than once (read, then
 * assigned), so it must be a side-effect-free lvalue. The temporaries use
 * reserved double-underscore names so that an argument that happens to be
 * called tmp0..tmp3 is not shadowed inside the macro. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2992
 
2993
/* Aliases for compatibility. */
2994
#define _m_pextrw   _mm_extract_pi16
#define _m_pinsrw   _mm_insert_pi16
#define _m_pmaxsw   _mm_max_pi16
#define _m_pmaxub   _mm_max_pu8
#define _m_pminsw   _mm_min_pi16
#define _m_pminub   _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw  _mm_mulhi_pu16
#define _m_pshufw   _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb    _mm_avg_pu8
#define _m_pavgw    _mm_avg_pu16
#define _m_psadbw   _mm_sad_pu8
#define _m_         _mm_
3008
 
3009
#undef __DEFAULT_FN_ATTRS
3010
#undef __DEFAULT_FN_ATTRS_MMX
3011
 
3012
/* Ugly hack for backwards-compatibility (compatible with gcc) */
3013
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3014
#include <emmintrin.h>
3015
#endif
3016
 
3017
#endif /* __XMMINTRIN_H */