/* Source: Subversion Repositories - QNX 8 / QNX8 LLVM/Clang compiler suite
 * (web viewer export; Rev 14, author pmbaty). */
/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
 
10
/* Include guard; the matching #endif is expected at the end of the header. */
#ifndef __MMINTRIN_H
#define __MMINTRIN_H

/* Refuse to compile on any target that defines neither __i386__ nor
 * __x86_64__. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
16
 
17
/* Public 64-bit vector type: 8 bytes wide with 8-byte alignment. */
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));

/* Element-typed views of an 8-byte vector (1 x long long, 2 x int, 4 x short,
 * 8 x char). The intrinsics below cast __m64 values to these types before
 * handing them to the builtins. */
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));
23
 
24
/* Default attributes for the functions in this file: always inlined, no debug
 * info, compiled with the "mmx" target feature enabled, and a minimum vector
 * width of 64 bits. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
26
 
27
/// Clears the MMX state by setting the state of the x87 stack registers
///    to empty.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
/// Takes no arguments and returns no value. Note that this function spells
/// out its attributes instead of using the file's default attribute macro,
/// so it carries no minimum-vector-width attribute.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
_mm_empty(void)
{
    __builtin_ia32_emms();
}
39
 
40
/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
41
///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
42
///
43
/// \headerfile <x86intrin.h>
44
///
45
/// This intrinsic corresponds to the <c> MOVD </c> instruction.
46
///
47
/// \param __i
48
///    A 32-bit integer value.
49
/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
50
///    parameter. The upper 32 bits are set to 0.
51
static __inline__ __m64 __DEFAULT_FN_ATTRS
52
_mm_cvtsi32_si64(int __i)
53
{
54
    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
55
}
56
 
57
/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
58
///    signed integer.
59
///
60
/// \headerfile <x86intrin.h>
61
///
62
/// This intrinsic corresponds to the <c> MOVD </c> instruction.
63
///
64
/// \param __m
65
///    A 64-bit integer vector.
66
/// \returns A 32-bit signed integer value containing the lower 32 bits of the
67
///    parameter.
68
static __inline__ int __DEFAULT_FN_ATTRS
69
_mm_cvtsi64_si32(__m64 __m)
70
{
71
    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
72
}
73
 
74
/// Casts a 64-bit signed integer value into a 64-bit integer vector.
75
///
76
/// \headerfile <x86intrin.h>
77
///
78
/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
79
///
80
/// \param __i
81
///    A 64-bit signed integer.
82
/// \returns A 64-bit integer vector containing the same bitwise pattern as the
83
///    parameter.
84
static __inline__ __m64 __DEFAULT_FN_ATTRS
85
_mm_cvtsi64_m64(long long __i)
86
{
87
    return (__m64)__i;
88
}
89
 
90
/// Casts a 64-bit integer vector into a 64-bit signed integer value.
91
///
92
/// \headerfile <x86intrin.h>
93
///
94
/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
95
///
96
/// \param __m
97
///    A 64-bit integer vector.
98
/// \returns A 64-bit signed integer containing the same bitwise pattern as the
99
///    parameter.
100
static __inline__ long long __DEFAULT_FN_ATTRS
101
_mm_cvtm64_si64(__m64 __m)
102
{
103
    return (long long)__m;
104
}
105
 
106
/// Converts 16-bit signed integers from both 64-bit integer vector
107
///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
108
///    a 64-bit integer vector of [8 x i8] as the result. Positive values
109
///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
110
///    are saturated to 0x80.
111
///
112
/// \headerfile <x86intrin.h>
113
///
114
/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
115
///
116
/// \param __m1
117
///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
118
///    16-bit signed integer and is converted to an 8-bit signed integer with
119
///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
120
///    Negative values less than 0x80 are saturated to 0x80. The converted
121
///    [4 x i8] values are written to the lower 32 bits of the result.
122
/// \param __m2
123
///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
124
///    16-bit signed integer and is converted to an 8-bit signed integer with
125
///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
126
///    Negative values less than 0x80 are saturated to 0x80. The converted
127
///    [4 x i8] values are written to the upper 32 bits of the result.
128
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
129
///    values.
130
static __inline__ __m64 __DEFAULT_FN_ATTRS
131
_mm_packs_pi16(__m64 __m1, __m64 __m2)
132
{
133
    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
134
}
135
 
136
/// Converts 32-bit signed integers from both 64-bit integer vector
137
///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
138
///    a 64-bit integer vector of [4 x i16] as the result. Positive values
139
///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
140
///    0x8000 are saturated to 0x8000.
141
///
142
/// \headerfile <x86intrin.h>
143
///
144
/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
145
///
146
/// \param __m1
147
///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
148
///    32-bit signed integer and is converted to a 16-bit signed integer with
149
///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
150
///    Negative values less than 0x8000 are saturated to 0x8000. The converted
151
///    [2 x i16] values are written to the lower 32 bits of the result.
152
/// \param __m2
153
///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
154
///    32-bit signed integer and is converted to a 16-bit signed integer with
155
///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
156
///    Negative values less than 0x8000 are saturated to 0x8000. The converted
157
///    [2 x i16] values are written to the upper 32 bits of the result.
158
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
159
///    values.
160
static __inline__ __m64 __DEFAULT_FN_ATTRS
161
_mm_packs_pi32(__m64 __m1, __m64 __m2)
162
{
163
    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
164
}
165
 
166
/// Converts 16-bit signed integers from both 64-bit integer vector
167
///    parameters of [4 x i16] into 8-bit unsigned integer values, and
168
///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
169
///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
170
///    to 0.
171
///
172
/// \headerfile <x86intrin.h>
173
///
174
/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
175
///
176
/// \param __m1
177
///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
178
///    16-bit signed integer and is converted to an 8-bit unsigned integer with
179
///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
180
///    than 0 are saturated to 0. The converted [4 x i8] values are written to
181
///    the lower 32 bits of the result.
182
/// \param __m2
183
///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
184
///    16-bit signed integer and is converted to an 8-bit unsigned integer with
185
///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
186
///    than 0 are saturated to 0. The converted [4 x i8] values are written to
187
///    the upper 32 bits of the result.
188
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
189
///    values.
190
static __inline__ __m64 __DEFAULT_FN_ATTRS
191
_mm_packs_pu16(__m64 __m1, __m64 __m2)
192
{
193
    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
194
}
195
 
196
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
197
///    and interleaves them into a 64-bit integer vector of [8 x i8].
198
///
199
/// \headerfile <x86intrin.h>
200
///
201
/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
202
///
203
/// \param __m1
204
///    A 64-bit integer vector of [8 x i8]. \n
205
///    Bits [39:32] are written to bits [7:0] of the result. \n
206
///    Bits [47:40] are written to bits [23:16] of the result. \n
207
///    Bits [55:48] are written to bits [39:32] of the result. \n
208
///    Bits [63:56] are written to bits [55:48] of the result.
209
/// \param __m2
210
///    A 64-bit integer vector of [8 x i8].
211
///    Bits [39:32] are written to bits [15:8] of the result. \n
212
///    Bits [47:40] are written to bits [31:24] of the result. \n
213
///    Bits [55:48] are written to bits [47:40] of the result. \n
214
///    Bits [63:56] are written to bits [63:56] of the result.
215
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
216
///    values.
217
static __inline__ __m64 __DEFAULT_FN_ATTRS
218
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
219
{
220
    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
221
}
222
 
223
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
224
///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
225
///
226
/// \headerfile <x86intrin.h>
227
///
228
/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
229
///
230
/// \param __m1
231
///    A 64-bit integer vector of [4 x i16].
232
///    Bits [47:32] are written to bits [15:0] of the result. \n
233
///    Bits [63:48] are written to bits [47:32] of the result.
234
/// \param __m2
235
///    A 64-bit integer vector of [4 x i16].
236
///    Bits [47:32] are written to bits [31:16] of the result. \n
237
///    Bits [63:48] are written to bits [63:48] of the result.
238
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
239
///    values.
240
static __inline__ __m64 __DEFAULT_FN_ATTRS
241
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
242
{
243
    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
244
}
245
 
246
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
247
///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
248
///
249
/// \headerfile <x86intrin.h>
250
///
251
/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
252
///
253
/// \param __m1
254
///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
255
///    the lower 32 bits of the result.
256
/// \param __m2
257
///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
258
///    the upper 32 bits of the result.
259
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
260
///    values.
261
static __inline__ __m64 __DEFAULT_FN_ATTRS
262
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
263
{
264
    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
265
}
266
 
267
/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
268
///    and interleaves them into a 64-bit integer vector of [8 x i8].
269
///
270
/// \headerfile <x86intrin.h>
271
///
272
/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
273
///
274
/// \param __m1
275
///    A 64-bit integer vector of [8 x i8].
276
///    Bits [7:0] are written to bits [7:0] of the result. \n
277
///    Bits [15:8] are written to bits [23:16] of the result. \n
278
///    Bits [23:16] are written to bits [39:32] of the result. \n
279
///    Bits [31:24] are written to bits [55:48] of the result.
280
/// \param __m2
281
///    A 64-bit integer vector of [8 x i8].
282
///    Bits [7:0] are written to bits [15:8] of the result. \n
283
///    Bits [15:8] are written to bits [31:24] of the result. \n
284
///    Bits [23:16] are written to bits [47:40] of the result. \n
285
///    Bits [31:24] are written to bits [63:56] of the result.
286
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
287
///    values.
288
static __inline__ __m64 __DEFAULT_FN_ATTRS
289
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
290
{
291
    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
292
}
293
 
294
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
295
///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
296
///
297
/// \headerfile <x86intrin.h>
298
///
299
/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
300
///
301
/// \param __m1
302
///    A 64-bit integer vector of [4 x i16].
303
///    Bits [15:0] are written to bits [15:0] of the result. \n
304
///    Bits [31:16] are written to bits [47:32] of the result.
305
/// \param __m2
306
///    A 64-bit integer vector of [4 x i16].
307
///    Bits [15:0] are written to bits [31:16] of the result. \n
308
///    Bits [31:16] are written to bits [63:48] of the result.
309
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
310
///    values.
311
static __inline__ __m64 __DEFAULT_FN_ATTRS
312
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
313
{
314
    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
315
}
316
 
317
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
318
///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
319
///
320
/// \headerfile <x86intrin.h>
321
///
322
/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
323
///
324
/// \param __m1
325
///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
326
///    the lower 32 bits of the result.
327
/// \param __m2
328
///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
329
///    the upper 32 bits of the result.
330
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
331
///    values.
332
static __inline__ __m64 __DEFAULT_FN_ATTRS
333
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
334
{
335
    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
336
}
337
 
338
/// Adds each 8-bit integer element of the first 64-bit integer vector
339
///    of [8 x i8] to the corresponding 8-bit integer element of the second
340
///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
341
///    packed into a 64-bit integer vector of [8 x i8].
342
///
343
/// \headerfile <x86intrin.h>
344
///
345
/// This intrinsic corresponds to the <c> PADDB </c> instruction.
346
///
347
/// \param __m1
348
///    A 64-bit integer vector of [8 x i8].
349
/// \param __m2
350
///    A 64-bit integer vector of [8 x i8].
351
/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
352
///    parameters.
353
static __inline__ __m64 __DEFAULT_FN_ATTRS
354
_mm_add_pi8(__m64 __m1, __m64 __m2)
355
{
356
    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
357
}
358
 
359
/// Adds each 16-bit integer element of the first 64-bit integer vector
360
///    of [4 x i16] to the corresponding 16-bit integer element of the second
361
///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
362
///    packed into a 64-bit integer vector of [4 x i16].
363
///
364
/// \headerfile <x86intrin.h>
365
///
366
/// This intrinsic corresponds to the <c> PADDW </c> instruction.
367
///
368
/// \param __m1
369
///    A 64-bit integer vector of [4 x i16].
370
/// \param __m2
371
///    A 64-bit integer vector of [4 x i16].
372
/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
373
///    parameters.
374
static __inline__ __m64 __DEFAULT_FN_ATTRS
375
_mm_add_pi16(__m64 __m1, __m64 __m2)
376
{
377
    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
378
}
379
 
380
/// Adds each 32-bit integer element of the first 64-bit integer vector
381
///    of [2 x i32] to the corresponding 32-bit integer element of the second
382
///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
383
///    packed into a 64-bit integer vector of [2 x i32].
384
///
385
/// \headerfile <x86intrin.h>
386
///
387
/// This intrinsic corresponds to the <c> PADDD </c> instruction.
388
///
389
/// \param __m1
390
///    A 64-bit integer vector of [2 x i32].
391
/// \param __m2
392
///    A 64-bit integer vector of [2 x i32].
393
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
394
///    parameters.
395
static __inline__ __m64 __DEFAULT_FN_ATTRS
396
_mm_add_pi32(__m64 __m1, __m64 __m2)
397
{
398
    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
399
}
400
 
401
/// Adds each 8-bit signed integer element of the first 64-bit integer
402
///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
403
///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
404
///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
405
///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
406
///
407
/// \headerfile <x86intrin.h>
408
///
409
/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
410
///
411
/// \param __m1
412
///    A 64-bit integer vector of [8 x i8].
413
/// \param __m2
414
///    A 64-bit integer vector of [8 x i8].
415
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
416
///    of both parameters.
417
static __inline__ __m64 __DEFAULT_FN_ATTRS
418
_mm_adds_pi8(__m64 __m1, __m64 __m2)
419
{
420
    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
421
}
422
 
423
/// Adds each 16-bit signed integer element of the first 64-bit integer
424
///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
425
///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
426
///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
427
///    saturated to 0x8000. The results are packed into a 64-bit integer vector
428
///    of [4 x i16].
429
///
430
/// \headerfile <x86intrin.h>
431
///
432
/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
433
///
434
/// \param __m1
435
///    A 64-bit integer vector of [4 x i16].
436
/// \param __m2
437
///    A 64-bit integer vector of [4 x i16].
438
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
439
///    of both parameters.
440
static __inline__ __m64 __DEFAULT_FN_ATTRS
441
_mm_adds_pi16(__m64 __m1, __m64 __m2)
442
{
443
    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
444
}
445
 
446
/// Adds each 8-bit unsigned integer element of the first 64-bit integer
447
///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
448
///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
449
///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
450
///    [8 x i8].
451
///
452
/// \headerfile <x86intrin.h>
453
///
454
/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
455
///
456
/// \param __m1
457
///    A 64-bit integer vector of [8 x i8].
458
/// \param __m2
459
///    A 64-bit integer vector of [8 x i8].
460
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
461
///    unsigned sums of both parameters.
462
static __inline__ __m64 __DEFAULT_FN_ATTRS
463
_mm_adds_pu8(__m64 __m1, __m64 __m2)
464
{
465
    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
466
}
467
 
468
/// Adds each 16-bit unsigned integer element of the first 64-bit integer
469
///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
470
///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
471
///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
472
///    integer vector of [4 x i16].
473
///
474
/// \headerfile <x86intrin.h>
475
///
476
/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
477
///
478
/// \param __m1
479
///    A 64-bit integer vector of [4 x i16].
480
/// \param __m2
481
///    A 64-bit integer vector of [4 x i16].
482
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
483
///    unsigned sums of both parameters.
484
static __inline__ __m64 __DEFAULT_FN_ATTRS
485
_mm_adds_pu16(__m64 __m1, __m64 __m2)
486
{
487
    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
488
}
489
 
490
/// Subtracts each 8-bit integer element of the second 64-bit integer
491
///    vector of [8 x i8] from the corresponding 8-bit integer element of the
492
///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
493
///    are packed into a 64-bit integer vector of [8 x i8].
494
///
495
/// \headerfile <x86intrin.h>
496
///
497
/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
498
///
499
/// \param __m1
500
///    A 64-bit integer vector of [8 x i8] containing the minuends.
501
/// \param __m2
502
///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
503
/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
504
///    both parameters.
505
static __inline__ __m64 __DEFAULT_FN_ATTRS
506
_mm_sub_pi8(__m64 __m1, __m64 __m2)
507
{
508
    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
509
}
510
 
511
/// Subtracts each 16-bit integer element of the second 64-bit integer
512
///    vector of [4 x i16] from the corresponding 16-bit integer element of the
513
///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
514
///    results are packed into a 64-bit integer vector of [4 x i16].
515
///
516
/// \headerfile <x86intrin.h>
517
///
518
/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
519
///
520
/// \param __m1
521
///    A 64-bit integer vector of [4 x i16] containing the minuends.
522
/// \param __m2
523
///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
524
/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
525
///    both parameters.
526
static __inline__ __m64 __DEFAULT_FN_ATTRS
527
_mm_sub_pi16(__m64 __m1, __m64 __m2)
528
{
529
    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
530
}
531
 
532
/// Subtracts each 32-bit integer element of the second 64-bit integer
533
///    vector of [2 x i32] from the corresponding 32-bit integer element of the
534
///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
535
///    results are packed into a 64-bit integer vector of [2 x i32].
536
///
537
/// \headerfile <x86intrin.h>
538
///
539
/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
540
///
541
/// \param __m1
542
///    A 64-bit integer vector of [2 x i32] containing the minuends.
543
/// \param __m2
544
///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
545
/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
546
///    both parameters.
547
static __inline__ __m64 __DEFAULT_FN_ATTRS
548
_mm_sub_pi32(__m64 __m1, __m64 __m2)
549
{
550
    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
551
}
552
 
553
/// Subtracts each 8-bit signed integer element of the second 64-bit
554
///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
555
///    element of the first 64-bit integer vector of [8 x i8]. Positive results
556
///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
557
///    are saturated to 0x80. The results are packed into a 64-bit integer
558
///    vector of [8 x i8].
559
///
560
/// \headerfile <x86intrin.h>
561
///
562
/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
563
///
564
/// \param __m1
565
///    A 64-bit integer vector of [8 x i8] containing the minuends.
566
/// \param __m2
567
///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
568
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
569
///    differences of both parameters.
570
static __inline__ __m64 __DEFAULT_FN_ATTRS
571
_mm_subs_pi8(__m64 __m1, __m64 __m2)
572
{
573
    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
574
}
575
 
576
/// Subtracts each 16-bit signed integer element of the second 64-bit
577
///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
578
///    element of the first 64-bit integer vector of [4 x i16]. Positive results
579
///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
580
///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
581
///    integer vector of [4 x i16].
582
///
583
/// \headerfile <x86intrin.h>
584
///
585
/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
586
///
587
/// \param __m1
588
///    A 64-bit integer vector of [4 x i16] containing the minuends.
589
/// \param __m2
590
///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
591
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
592
///    differences of both parameters.
593
static __inline__ __m64 __DEFAULT_FN_ATTRS
594
_mm_subs_pi16(__m64 __m1, __m64 __m2)
595
{
596
    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
597
}
598
 
599
/// Subtracts each 8-bit unsigned integer element of the second 64-bit
600
///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
601
///    element of the first 64-bit integer vector of [8 x i8].
602
///
603
///    If an element of the first vector is less than the corresponding element
604
///    of the second vector, the result is saturated to 0. The results are
605
///    packed into a 64-bit integer vector of [8 x i8].
606
///
607
/// \headerfile <x86intrin.h>
608
///
609
/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
610
///
611
/// \param __m1
612
///    A 64-bit integer vector of [8 x i8] containing the minuends.
613
/// \param __m2
614
///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
615
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
616
///    differences of both parameters.
617
static __inline__ __m64 __DEFAULT_FN_ATTRS
618
_mm_subs_pu8(__m64 __m1, __m64 __m2)
619
{
620
    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
621
}
622
 
623
/// Subtracts each 16-bit unsigned integer element of the second 64-bit
624
///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
625
///    integer element of the first 64-bit integer vector of [4 x i16].
626
///
627
///    If an element of the first vector is less than the corresponding element
628
///    of the second vector, the result is saturated to 0. The results are
629
///    packed into a 64-bit integer vector of [4 x i16].
630
///
631
/// \headerfile <x86intrin.h>
632
///
633
/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
634
///
635
/// \param __m1
636
///    A 64-bit integer vector of [4 x i16] containing the minuends.
637
/// \param __m2
638
///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
639
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
640
///    differences of both parameters.
641
static __inline__ __m64 __DEFAULT_FN_ATTRS
642
_mm_subs_pu16(__m64 __m1, __m64 __m2)
643
{
644
    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
645
}
646
 
647
/// Multiplies each 16-bit signed integer element of the first 64-bit
648
///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
649
///    element of the second 64-bit integer vector of [4 x i16] and get four
650
///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
651
///    The lower 32 bits of these two sums are packed into a 64-bit integer
652
///    vector of [2 x i32].
653
///
654
///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
655
///    of both parameters are multiplied, and the sum of both results is written
656
///    to bits [31:0] of the result.
657
///
658
/// \headerfile <x86intrin.h>
659
///
660
/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
661
///
662
/// \param __m1
663
///    A 64-bit integer vector of [4 x i16].
664
/// \param __m2
665
///    A 64-bit integer vector of [4 x i16].
666
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
667
///    products of both parameters.
668
static __inline__ __m64 __DEFAULT_FN_ATTRS
669
_mm_madd_pi16(__m64 __m1, __m64 __m2)
670
{
671
    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
672
}
673
 
674
/// Multiplies each 16-bit signed integer element of the first 64-bit
675
///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
676
///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
677
///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
678
///
679
/// \headerfile <x86intrin.h>
680
///
681
/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
682
///
683
/// \param __m1
684
///    A 64-bit integer vector of [4 x i16].
685
/// \param __m2
686
///    A 64-bit integer vector of [4 x i16].
687
/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
688
///    of the products of both parameters.
689
static __inline__ __m64 __DEFAULT_FN_ATTRS
690
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
691
{
692
    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
693
}
694
 
695
/// Multiplies each 16-bit signed integer element of the first 64-bit
696
///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
697
///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
698
///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
699
///
700
/// \headerfile <x86intrin.h>
701
///
702
/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
703
///
704
/// \param __m1
705
///    A 64-bit integer vector of [4 x i16].
706
/// \param __m2
707
///    A 64-bit integer vector of [4 x i16].
708
/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
709
///    of the products of both parameters.
710
static __inline__ __m64 __DEFAULT_FN_ATTRS
711
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
712
{
713
    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
714
}
715
 
716
/// Left-shifts each 16-bit signed integer element of the first
717
///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
718
///    of bits specified by the second parameter, which is a 64-bit integer. The
719
///    lower 16 bits of the results are packed into a 64-bit integer vector of
720
///    [4 x i16].
721
///
722
/// \headerfile <x86intrin.h>
723
///
724
/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
725
///
726
/// \param __m
727
///    A 64-bit integer vector of [4 x i16].
728
/// \param __count
729
///    A 64-bit integer vector interpreted as a single 64-bit integer.
730
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
731
///    values. If \a __count is greater or equal to 16, the result is set to all
732
///    0.
733
static __inline__ __m64 __DEFAULT_FN_ATTRS
734
_mm_sll_pi16(__m64 __m, __m64 __count)
735
{
736
    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
737
}
738
 
739
/// Left-shifts each 16-bit signed integer element of a 64-bit integer
740
///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
741
///    The lower 16 bits of the results are packed into a 64-bit integer vector
742
///    of [4 x i16].
743
///
744
/// \headerfile <x86intrin.h>
745
///
746
/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
747
///
748
/// \param __m
749
///    A 64-bit integer vector of [4 x i16].
750
/// \param __count
751
///    A 32-bit integer value.
752
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
753
///    values. If \a __count is greater or equal to 16, the result is set to all
754
///    0.
755
static __inline__ __m64 __DEFAULT_FN_ATTRS
756
_mm_slli_pi16(__m64 __m, int __count)
757
{
758
    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
759
}
760
 
761
/// Left-shifts each 32-bit signed integer element of the first
762
///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
763
///    of bits specified by the second parameter, which is a 64-bit integer. The
764
///    lower 32 bits of the results are packed into a 64-bit integer vector of
765
///    [2 x i32].
766
///
767
/// \headerfile <x86intrin.h>
768
///
769
/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
770
///
771
/// \param __m
772
///    A 64-bit integer vector of [2 x i32].
773
/// \param __count
774
///    A 64-bit integer vector interpreted as a single 64-bit integer.
775
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
776
///    values. If \a __count is greater or equal to 32, the result is set to all
777
///    0.
778
static __inline__ __m64 __DEFAULT_FN_ATTRS
779
_mm_sll_pi32(__m64 __m, __m64 __count)
780
{
781
    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
782
}
783
 
784
/// Left-shifts each 32-bit signed integer element of a 64-bit integer
785
///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
786
///    The lower 32 bits of the results are packed into a 64-bit integer vector
787
///    of [2 x i32].
788
///
789
/// \headerfile <x86intrin.h>
790
///
791
/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
792
///
793
/// \param __m
794
///    A 64-bit integer vector of [2 x i32].
795
/// \param __count
796
///    A 32-bit integer value.
797
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
798
///    values. If \a __count is greater or equal to 32, the result is set to all
799
///    0.
800
static __inline__ __m64 __DEFAULT_FN_ATTRS
801
_mm_slli_pi32(__m64 __m, int __count)
802
{
803
    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
804
}
805
 
806
/// Left-shifts the first 64-bit integer parameter by the number of bits
807
///    specified by the second 64-bit integer parameter. The lower 64 bits of
808
///    result are returned.
809
///
810
/// \headerfile <x86intrin.h>
811
///
812
/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
813
///
814
/// \param __m
815
///    A 64-bit integer vector interpreted as a single 64-bit integer.
816
/// \param __count
817
///    A 64-bit integer vector interpreted as a single 64-bit integer.
818
/// \returns A 64-bit integer vector containing the left-shifted value. If
819
///     \a __count is greater or equal to 64, the result is set to 0.
820
static __inline__ __m64 __DEFAULT_FN_ATTRS
821
_mm_sll_si64(__m64 __m, __m64 __count)
822
{
823
    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
824
}
825
 
826
/// Left-shifts the first parameter, which is a 64-bit integer, by the
827
///    number of bits specified by the second parameter, which is a 32-bit
828
///    integer. The lower 64 bits of result are returned.
829
///
830
/// \headerfile <x86intrin.h>
831
///
832
/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
833
///
834
/// \param __m
835
///    A 64-bit integer vector interpreted as a single 64-bit integer.
836
/// \param __count
837
///    A 32-bit integer value.
838
/// \returns A 64-bit integer vector containing the left-shifted value. If
839
///     \a __count is greater or equal to 64, the result is set to 0.
840
static __inline__ __m64 __DEFAULT_FN_ATTRS
841
_mm_slli_si64(__m64 __m, int __count)
842
{
843
    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
844
}
845
 
846
/// Right-shifts each 16-bit integer element of the first parameter,
847
///    which is a 64-bit integer vector of [4 x i16], by the number of bits
848
///    specified by the second parameter, which is a 64-bit integer.
849
///
850
///    High-order bits are filled with the sign bit of the initial value of each
851
///    16-bit element. The 16-bit results are packed into a 64-bit integer
852
///    vector of [4 x i16].
853
///
854
/// \headerfile <x86intrin.h>
855
///
856
/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
857
///
858
/// \param __m
859
///    A 64-bit integer vector of [4 x i16].
860
/// \param __count
861
///    A 64-bit integer vector interpreted as a single 64-bit integer.
862
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
863
///    values.
864
static __inline__ __m64 __DEFAULT_FN_ATTRS
865
_mm_sra_pi16(__m64 __m, __m64 __count)
866
{
867
    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
868
}
869
 
870
/// Right-shifts each 16-bit integer element of a 64-bit integer vector
871
///    of [4 x i16] by the number of bits specified by a 32-bit integer.
872
///
873
///    High-order bits are filled with the sign bit of the initial value of each
874
///    16-bit element. The 16-bit results are packed into a 64-bit integer
875
///    vector of [4 x i16].
876
///
877
/// \headerfile <x86intrin.h>
878
///
879
/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
880
///
881
/// \param __m
882
///    A 64-bit integer vector of [4 x i16].
883
/// \param __count
884
///    A 32-bit integer value.
885
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
886
///    values.
887
static __inline__ __m64 __DEFAULT_FN_ATTRS
888
_mm_srai_pi16(__m64 __m, int __count)
889
{
890
    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
891
}
892
 
893
/// Right-shifts each 32-bit integer element of the first parameter,
894
///    which is a 64-bit integer vector of [2 x i32], by the number of bits
895
///    specified by the second parameter, which is a 64-bit integer.
896
///
897
///    High-order bits are filled with the sign bit of the initial value of each
898
///    32-bit element. The 32-bit results are packed into a 64-bit integer
899
///    vector of [2 x i32].
900
///
901
/// \headerfile <x86intrin.h>
902
///
903
/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
904
///
905
/// \param __m
906
///    A 64-bit integer vector of [2 x i32].
907
/// \param __count
908
///    A 64-bit integer vector interpreted as a single 64-bit integer.
909
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
910
///    values.
911
static __inline__ __m64 __DEFAULT_FN_ATTRS
912
_mm_sra_pi32(__m64 __m, __m64 __count)
913
{
914
    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
915
}
916
 
917
/// Right-shifts each 32-bit integer element of a 64-bit integer vector
918
///    of [2 x i32] by the number of bits specified by a 32-bit integer.
919
///
920
///    High-order bits are filled with the sign bit of the initial value of each
921
///    32-bit element. The 32-bit results are packed into a 64-bit integer
922
///    vector of [2 x i32].
923
///
924
/// \headerfile <x86intrin.h>
925
///
926
/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
927
///
928
/// \param __m
929
///    A 64-bit integer vector of [2 x i32].
930
/// \param __count
931
///    A 32-bit integer value.
932
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
933
///    values.
934
static __inline__ __m64 __DEFAULT_FN_ATTRS
935
_mm_srai_pi32(__m64 __m, int __count)
936
{
937
    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
938
}
939
 
940
/// Right-shifts each 16-bit integer element of the first parameter,
941
///    which is a 64-bit integer vector of [4 x i16], by the number of bits
942
///    specified by the second parameter, which is a 64-bit integer.
943
///
944
///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
945
///    integer vector of [4 x i16].
946
///
947
/// \headerfile <x86intrin.h>
948
///
949
/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
950
///
951
/// \param __m
952
///    A 64-bit integer vector of [4 x i16].
953
/// \param __count
954
///    A 64-bit integer vector interpreted as a single 64-bit integer.
955
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
956
///    values.
957
static __inline__ __m64 __DEFAULT_FN_ATTRS
958
_mm_srl_pi16(__m64 __m, __m64 __count)
959
{
960
    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
961
}
962
 
963
/// Right-shifts each 16-bit integer element of a 64-bit integer vector
964
///    of [4 x i16] by the number of bits specified by a 32-bit integer.
965
///
966
///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
967
///    integer vector of [4 x i16].
968
///
969
/// \headerfile <x86intrin.h>
970
///
971
/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
972
///
973
/// \param __m
974
///    A 64-bit integer vector of [4 x i16].
975
/// \param __count
976
///    A 32-bit integer value.
977
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
978
///    values.
979
static __inline__ __m64 __DEFAULT_FN_ATTRS
980
_mm_srli_pi16(__m64 __m, int __count)
981
{
982
    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
983
}
984
 
985
/// Right-shifts each 32-bit integer element of the first parameter,
986
///    which is a 64-bit integer vector of [2 x i32], by the number of bits
987
///    specified by the second parameter, which is a 64-bit integer.
988
///
989
///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
990
///    integer vector of [2 x i32].
991
///
992
/// \headerfile <x86intrin.h>
993
///
994
/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
995
///
996
/// \param __m
997
///    A 64-bit integer vector of [2 x i32].
998
/// \param __count
999
///    A 64-bit integer vector interpreted as a single 64-bit integer.
1000
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1001
///    values.
1002
static __inline__ __m64 __DEFAULT_FN_ATTRS
1003
_mm_srl_pi32(__m64 __m, __m64 __count)
1004
{
1005
    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
1006
}
1007
 
1008
/// Right-shifts each 32-bit integer element of a 64-bit integer vector
1009
///    of [2 x i32] by the number of bits specified by a 32-bit integer.
1010
///
1011
///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
1012
///    integer vector of [2 x i32].
1013
///
1014
/// \headerfile <x86intrin.h>
1015
///
1016
/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1017
///
1018
/// \param __m
1019
///    A 64-bit integer vector of [2 x i32].
1020
/// \param __count
1021
///    A 32-bit integer value.
1022
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1023
///    values.
1024
static __inline__ __m64 __DEFAULT_FN_ATTRS
1025
_mm_srli_pi32(__m64 __m, int __count)
1026
{
1027
    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
1028
}
1029
 
1030
/// Right-shifts the first 64-bit integer parameter by the number of bits
1031
///    specified by the second 64-bit integer parameter.
1032
///
1033
///    High-order bits are cleared.
1034
///
1035
/// \headerfile <x86intrin.h>
1036
///
1037
/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1038
///
1039
/// \param __m
1040
///    A 64-bit integer vector interpreted as a single 64-bit integer.
1041
/// \param __count
1042
///    A 64-bit integer vector interpreted as a single 64-bit integer.
1043
/// \returns A 64-bit integer vector containing the right-shifted value.
1044
static __inline__ __m64 __DEFAULT_FN_ATTRS
1045
_mm_srl_si64(__m64 __m, __m64 __count)
1046
{
1047
    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
1048
}
1049
 
1050
/// Right-shifts the first parameter, which is a 64-bit integer, by the
1051
///    number of bits specified by the second parameter, which is a 32-bit
1052
///    integer.
1053
///
1054
///    High-order bits are cleared.
1055
///
1056
/// \headerfile <x86intrin.h>
1057
///
1058
/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1059
///
1060
/// \param __m
1061
///    A 64-bit integer vector interpreted as a single 64-bit integer.
1062
/// \param __count
1063
///    A 32-bit integer value.
1064
/// \returns A 64-bit integer vector containing the right-shifted value.
1065
static __inline__ __m64 __DEFAULT_FN_ATTRS
1066
_mm_srli_si64(__m64 __m, int __count)
1067
{
1068
    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
1069
}
1070
 
1071
/// Performs a bitwise AND of two 64-bit integer vectors.
1072
///
1073
/// \headerfile <x86intrin.h>
1074
///
1075
/// This intrinsic corresponds to the <c> PAND </c> instruction.
1076
///
1077
/// \param __m1
1078
///    A 64-bit integer vector.
1079
/// \param __m2
1080
///    A 64-bit integer vector.
1081
/// \returns A 64-bit integer vector containing the bitwise AND of both
1082
///    parameters.
1083
static __inline__ __m64 __DEFAULT_FN_ATTRS
1084
_mm_and_si64(__m64 __m1, __m64 __m2)
1085
{
1086
    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
1087
}
1088
 
1089
/// Performs a bitwise NOT of the first 64-bit integer vector, and then
1090
///    performs a bitwise AND of the intermediate result and the second 64-bit
1091
///    integer vector.
1092
///
1093
/// \headerfile <x86intrin.h>
1094
///
1095
/// This intrinsic corresponds to the <c> PANDN </c> instruction.
1096
///
1097
/// \param __m1
1098
///    A 64-bit integer vector. The one's complement of this parameter is used
1099
///    in the bitwise AND.
1100
/// \param __m2
1101
///    A 64-bit integer vector.
1102
/// \returns A 64-bit integer vector containing the bitwise AND of the second
1103
///    parameter and the one's complement of the first parameter.
1104
static __inline__ __m64 __DEFAULT_FN_ATTRS
1105
_mm_andnot_si64(__m64 __m1, __m64 __m2)
1106
{
1107
    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
1108
}
1109
 
1110
/// Performs a bitwise OR of two 64-bit integer vectors.
1111
///
1112
/// \headerfile <x86intrin.h>
1113
///
1114
/// This intrinsic corresponds to the <c> POR </c> instruction.
1115
///
1116
/// \param __m1
1117
///    A 64-bit integer vector.
1118
/// \param __m2
1119
///    A 64-bit integer vector.
1120
/// \returns A 64-bit integer vector containing the bitwise OR of both
1121
///    parameters.
1122
static __inline__ __m64 __DEFAULT_FN_ATTRS
1123
_mm_or_si64(__m64 __m1, __m64 __m2)
1124
{
1125
    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
1126
}
1127
 
1128
/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
1129
///
1130
/// \headerfile <x86intrin.h>
1131
///
1132
/// This intrinsic corresponds to the <c> PXOR </c> instruction.
1133
///
1134
/// \param __m1
1135
///    A 64-bit integer vector.
1136
/// \param __m2
1137
///    A 64-bit integer vector.
1138
/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
1139
///    parameters.
1140
static __inline__ __m64 __DEFAULT_FN_ATTRS
1141
_mm_xor_si64(__m64 __m1, __m64 __m2)
1142
{
1143
    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
1144
}
1145
 
1146
/// Compares the 8-bit integer elements of two 64-bit integer vectors of
1147
///    [8 x i8] to determine if the element of the first vector is equal to the
1148
///    corresponding element of the second vector.
1149
///
1150
///    The comparison yields 0 for false, 0xFF for true.
1151
///
1152
/// \headerfile <x86intrin.h>
1153
///
1154
/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
1155
///
1156
/// \param __m1
1157
///    A 64-bit integer vector of [8 x i8].
1158
/// \param __m2
1159
///    A 64-bit integer vector of [8 x i8].
1160
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1161
///    results.
1162
static __inline__ __m64 __DEFAULT_FN_ATTRS
1163
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
1164
{
1165
    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
1166
}
1167
 
1168
/// Compares the 16-bit integer elements of two 64-bit integer vectors of
1169
///    [4 x i16] to determine if the element of the first vector is equal to the
1170
///    corresponding element of the second vector.
1171
///
1172
///    The comparison yields 0 for false, 0xFFFF for true.
1173
///
1174
/// \headerfile <x86intrin.h>
1175
///
1176
/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
1177
///
1178
/// \param __m1
1179
///    A 64-bit integer vector of [4 x i16].
1180
/// \param __m2
1181
///    A 64-bit integer vector of [4 x i16].
1182
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1183
///    results.
1184
static __inline__ __m64 __DEFAULT_FN_ATTRS
1185
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
1186
{
1187
    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
1188
}
1189
 
1190
/// Compares the 32-bit integer elements of two 64-bit integer vectors of
1191
///    [2 x i32] to determine if the element of the first vector is equal to the
1192
///    corresponding element of the second vector.
1193
///
1194
///    The comparison yields 0 for false, 0xFFFFFFFF for true.
1195
///
1196
/// \headerfile <x86intrin.h>
1197
///
1198
/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
1199
///
1200
/// \param __m1
1201
///    A 64-bit integer vector of [2 x i32].
1202
/// \param __m2
1203
///    A 64-bit integer vector of [2 x i32].
1204
/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1205
///    results.
1206
static __inline__ __m64 __DEFAULT_FN_ATTRS
1207
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
1208
{
1209
    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
1210
}
1211
 
1212
/// Compares the 8-bit integer elements of two 64-bit integer vectors of
1213
///    [8 x i8] to determine if the element of the first vector is greater than
1214
///    the corresponding element of the second vector.
1215
///
1216
///    The comparison yields 0 for false, 0xFF for true.
1217
///
1218
/// \headerfile <x86intrin.h>
1219
///
1220
/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
1221
///
1222
/// \param __m1
1223
///    A 64-bit integer vector of [8 x i8].
1224
/// \param __m2
1225
///    A 64-bit integer vector of [8 x i8].
1226
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1227
///    results.
1228
static __inline__ __m64 __DEFAULT_FN_ATTRS
1229
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
1230
{
1231
    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
1232
}
1233
 
1234
/// Compares the 16-bit integer elements of two 64-bit integer vectors of
1235
///    [4 x i16] to determine if the element of the first vector is greater than
1236
///    the corresponding element of the second vector.
1237
///
1238
///    The comparison yields 0 for false, 0xFFFF for true.
1239
///
1240
/// \headerfile <x86intrin.h>
1241
///
1242
/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
1243
///
1244
/// \param __m1
1245
///    A 64-bit integer vector of [4 x i16].
1246
/// \param __m2
1247
///    A 64-bit integer vector of [4 x i16].
1248
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1249
///    results.
1250
static __inline__ __m64 __DEFAULT_FN_ATTRS
1251
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
1252
{
1253
    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
1254
}
1255
 
1256
/// Compares the 32-bit integer elements of two 64-bit integer vectors of
1257
///    [2 x i32] to determine if the element of the first vector is greater than
1258
///    the corresponding element of the second vector.
1259
///
1260
///    The comparison yields 0 for false, 0xFFFFFFFF for true.
1261
///
1262
/// \headerfile <x86intrin.h>
1263
///
1264
/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
1265
///
1266
/// \param __m1
1267
///    A 64-bit integer vector of [2 x i32].
1268
/// \param __m2
1269
///    A 64-bit integer vector of [2 x i32].
1270
/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1271
///    results.
1272
static __inline__ __m64 __DEFAULT_FN_ATTRS
1273
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
1274
{
1275
    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
1276
}
1277
 
1278
/// Constructs a 64-bit integer vector initialized to zero.
1279
///
1280
/// \headerfile <x86intrin.h>
1281
///
1282
/// This intrinsic corresponds to the <c> PXOR </c> instruction.
1283
///
1284
/// \returns An initialized 64-bit integer vector with all elements set to zero.
1285
static __inline__ __m64 __DEFAULT_FN_ATTRS
1286
_mm_setzero_si64(void)
1287
{
1288
    return __extension__ (__m64){ 0LL };
1289
}
1290
 
1291
/// Constructs a 64-bit integer vector initialized with the specified
1292
///    32-bit integer values.
1293
///
1294
/// \headerfile <x86intrin.h>
1295
///
1296
/// This intrinsic is a utility function and does not correspond to a specific
1297
///    instruction.
1298
///
1299
/// \param __i1
1300
///    A 32-bit integer value used to initialize the upper 32 bits of the
1301
///    result.
1302
/// \param __i0
1303
///    A 32-bit integer value used to initialize the lower 32 bits of the
1304
///    result.
1305
/// \returns An initialized 64-bit integer vector.
1306
static __inline__ __m64 __DEFAULT_FN_ATTRS
1307
_mm_set_pi32(int __i1, int __i0)
1308
{
1309
    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
1310
}
1311
 
1312
/// Constructs a 64-bit integer vector initialized with the specified
1313
///    16-bit integer values.
1314
///
1315
/// \headerfile <x86intrin.h>
1316
///
1317
/// This intrinsic is a utility function and does not correspond to a specific
1318
///    instruction.
1319
///
1320
/// \param __s3
1321
///    A 16-bit integer value used to initialize bits [63:48] of the result.
1322
/// \param __s2
1323
///    A 16-bit integer value used to initialize bits [47:32] of the result.
1324
/// \param __s1
1325
///    A 16-bit integer value used to initialize bits [31:16] of the result.
1326
/// \param __s0
1327
///    A 16-bit integer value used to initialize bits [15:0] of the result.
1328
/// \returns An initialized 64-bit integer vector.
1329
static __inline__ __m64 __DEFAULT_FN_ATTRS
1330
_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
1331
{
1332
    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
1333
}
1334
 
1335
/// Constructs a 64-bit integer vector initialized with the specified
1336
///    8-bit integer values.
1337
///
1338
/// \headerfile <x86intrin.h>
1339
///
1340
/// This intrinsic is a utility function and does not correspond to a specific
1341
///    instruction.
1342
///
1343
/// \param __b7
1344
///    An 8-bit integer value used to initialize bits [63:56] of the result.
1345
/// \param __b6
1346
///    An 8-bit integer value used to initialize bits [55:48] of the result.
1347
/// \param __b5
1348
///    An 8-bit integer value used to initialize bits [47:40] of the result.
1349
/// \param __b4
1350
///    An 8-bit integer value used to initialize bits [39:32] of the result.
1351
/// \param __b3
1352
///    An 8-bit integer value used to initialize bits [31:24] of the result.
1353
/// \param __b2
1354
///    An 8-bit integer value used to initialize bits [23:16] of the result.
1355
/// \param __b1
1356
///    An 8-bit integer value used to initialize bits [15:8] of the result.
1357
/// \param __b0
1358
///    An 8-bit integer value used to initialize bits [7:0] of the result.
1359
/// \returns An initialized 64-bit integer vector.
1360
static __inline__ __m64 __DEFAULT_FN_ATTRS
1361
_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
1362
            char __b1, char __b0)
1363
{
1364
    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
1365
                                               __b4, __b5, __b6, __b7);
1366
}
1367
 
1368
/// Constructs a 64-bit integer vector of [2 x i32], with each of the
1369
///    32-bit integer vector elements set to the specified 32-bit integer
1370
///    value.
1371
///
1372
/// \headerfile <x86intrin.h>
1373
///
1374
/// This intrinsic is a utility function and does not correspond to a specific
1375
///    instruction.
1376
///
1377
/// \param __i
1378
///    A 32-bit integer value used to initialize each vector element of the
1379
///    result.
1380
/// \returns An initialized 64-bit integer vector of [2 x i32].
1381
static __inline__ __m64 __DEFAULT_FN_ATTRS
1382
_mm_set1_pi32(int __i)
1383
{
1384
    return _mm_set_pi32(__i, __i);
1385
}
1386
 
1387
/// Constructs a 64-bit integer vector of [4 x i16], with each of the
1388
///    16-bit integer vector elements set to the specified 16-bit integer
1389
///    value.
1390
///
1391
/// \headerfile <x86intrin.h>
1392
///
1393
/// This intrinsic is a utility function and does not correspond to a specific
1394
///    instruction.
1395
///
1396
/// \param __w
1397
///    A 16-bit integer value used to initialize each vector element of the
1398
///    result.
1399
/// \returns An initialized 64-bit integer vector of [4 x i16].
1400
static __inline__ __m64 __DEFAULT_FN_ATTRS
1401
_mm_set1_pi16(short __w)
1402
{
1403
    return _mm_set_pi16(__w, __w, __w, __w);
1404
}
1405
 
1406
/// Constructs a 64-bit integer vector of [8 x i8], with each of the
1407
///    8-bit integer vector elements set to the specified 8-bit integer value.
1408
///
1409
/// \headerfile <x86intrin.h>
1410
///
1411
/// This intrinsic is a utility function and does not correspond to a specific
1412
///    instruction.
1413
///
1414
/// \param __b
1415
///    An 8-bit integer value used to initialize each vector element of the
1416
///    result.
1417
/// \returns An initialized 64-bit integer vector of [8 x i8].
1418
static __inline__ __m64 __DEFAULT_FN_ATTRS
1419
_mm_set1_pi8(char __b)
1420
{
1421
    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
1422
}
1423
 
1424
/// Constructs a 64-bit integer vector, initialized in reverse order with
1425
///    the specified 32-bit integer values.
1426
///
1427
/// \headerfile <x86intrin.h>
1428
///
1429
/// This intrinsic is a utility function and does not correspond to a specific
1430
///    instruction.
1431
///
1432
/// \param __i0
1433
///    A 32-bit integer value used to initialize the lower 32 bits of the
1434
///    result.
1435
/// \param __i1
1436
///    A 32-bit integer value used to initialize the upper 32 bits of the
1437
///    result.
1438
/// \returns An initialized 64-bit integer vector.
1439
static __inline__ __m64 __DEFAULT_FN_ATTRS
1440
_mm_setr_pi32(int __i0, int __i1)
1441
{
1442
    return _mm_set_pi32(__i1, __i0);
1443
}
1444
 
1445
/// Constructs a 64-bit integer vector, initialized in reverse order with
1446
///    the specified 16-bit integer values.
1447
///
1448
/// \headerfile <x86intrin.h>
1449
///
1450
/// This intrinsic is a utility function and does not correspond to a specific
1451
///    instruction.
1452
///
1453
/// \param __w0
1454
///    A 16-bit integer value used to initialize bits [15:0] of the result.
1455
/// \param __w1
1456
///    A 16-bit integer value used to initialize bits [31:16] of the result.
1457
/// \param __w2
1458
///    A 16-bit integer value used to initialize bits [47:32] of the result.
1459
/// \param __w3
1460
///    A 16-bit integer value used to initialize bits [63:48] of the result.
1461
/// \returns An initialized 64-bit integer vector.
1462
static __inline__ __m64 __DEFAULT_FN_ATTRS
1463
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
1464
{
1465
    return _mm_set_pi16(__w3, __w2, __w1, __w0);
1466
}
1467
 
1468
/// Constructs a 64-bit integer vector, initialized in reverse order with
1469
///    the specified 8-bit integer values.
1470
///
1471
/// \headerfile <x86intrin.h>
1472
///
1473
/// This intrinsic is a utility function and does not correspond to a specific
1474
///    instruction.
1475
///
1476
/// \param __b0
1477
///    An 8-bit integer value used to initialize bits [7:0] of the result.
1478
/// \param __b1
1479
///    An 8-bit integer value used to initialize bits [15:8] of the result.
1480
/// \param __b2
1481
///    An 8-bit integer value used to initialize bits [23:16] of the result.
1482
/// \param __b3
1483
///    An 8-bit integer value used to initialize bits [31:24] of the result.
1484
/// \param __b4
1485
///    An 8-bit integer value used to initialize bits [39:32] of the result.
1486
/// \param __b5
1487
///    An 8-bit integer value used to initialize bits [47:40] of the result.
1488
/// \param __b6
1489
///    An 8-bit integer value used to initialize bits [55:48] of the result.
1490
/// \param __b7
1491
///    An 8-bit integer value used to initialize bits [63:56] of the result.
1492
/// \returns An initialized 64-bit integer vector.
1493
static __inline__ __m64 __DEFAULT_FN_ATTRS
1494
_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
1495
             char __b6, char __b7)
1496
{
1497
    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1498
}
1499
 
1500
#undef __DEFAULT_FN_ATTRS
1501
 
1502
/* Aliases for compatibility: each instruction-style _m_* name expands to the
 * corresponding _mm_* intrinsic name. */

/* State management and scalar conversions. */
#define _m_empty       _mm_empty
#define _m_from_int    _mm_cvtsi32_si64
#define _m_from_int64  _mm_cvtsi64_m64
#define _m_to_int      _mm_cvtsi64_si32
#define _m_to_int64    _mm_cvtm64_si64

/* Pack and unpack. */
#define _m_packsswb    _mm_packs_pi16
#define _m_packssdw    _mm_packs_pi32
#define _m_packuswb    _mm_packs_pu16
#define _m_punpckhbw   _mm_unpackhi_pi8
#define _m_punpckhwd   _mm_unpackhi_pi16
#define _m_punpckhdq   _mm_unpackhi_pi32
#define _m_punpcklbw   _mm_unpacklo_pi8
#define _m_punpcklwd   _mm_unpacklo_pi16
#define _m_punpckldq   _mm_unpacklo_pi32

/* Addition. */
#define _m_paddb       _mm_add_pi8
#define _m_paddw       _mm_add_pi16
#define _m_paddd       _mm_add_pi32
#define _m_paddsb      _mm_adds_pi8
#define _m_paddsw      _mm_adds_pi16
#define _m_paddusb     _mm_adds_pu8
#define _m_paddusw     _mm_adds_pu16

/* Subtraction. */
#define _m_psubb       _mm_sub_pi8
#define _m_psubw       _mm_sub_pi16
#define _m_psubd       _mm_sub_pi32
#define _m_psubsb      _mm_subs_pi8
#define _m_psubsw      _mm_subs_pi16
#define _m_psubusb     _mm_subs_pu8
#define _m_psubusw     _mm_subs_pu16

/* Multiplication. */
#define _m_pmaddwd     _mm_madd_pi16
#define _m_pmulhw      _mm_mulhi_pi16
#define _m_pmullw      _mm_mullo_pi16

/* Left shifts. */
#define _m_psllw       _mm_sll_pi16
#define _m_psllwi      _mm_slli_pi16
#define _m_pslld       _mm_sll_pi32
#define _m_pslldi      _mm_slli_pi32
#define _m_psllq       _mm_sll_si64
#define _m_psllqi      _mm_slli_si64

/* Arithmetic right shifts. */
#define _m_psraw       _mm_sra_pi16
#define _m_psrawi      _mm_srai_pi16
#define _m_psrad       _mm_sra_pi32
#define _m_psradi      _mm_srai_pi32

/* Logical right shifts. */
#define _m_psrlw       _mm_srl_pi16
#define _m_psrlwi      _mm_srli_pi16
#define _m_psrld       _mm_srl_pi32
#define _m_psrldi      _mm_srli_pi32
#define _m_psrlq       _mm_srl_si64
#define _m_psrlqi      _mm_srli_si64

/* Bitwise logic. */
#define _m_pand        _mm_and_si64
#define _m_pandn       _mm_andnot_si64
#define _m_por         _mm_or_si64
#define _m_pxor        _mm_xor_si64

/* Comparisons. */
#define _m_pcmpeqb     _mm_cmpeq_pi8
#define _m_pcmpeqw     _mm_cmpeq_pi16
#define _m_pcmpeqd     _mm_cmpeq_pi32
#define _m_pcmpgtb     _mm_cmpgt_pi8
#define _m_pcmpgtw     _mm_cmpgt_pi16
#define _m_pcmpgtd     _mm_cmpgt_pi32
1560
 
1561
#endif /* __MMINTRIN_H */
1562