Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
 
10
#ifndef __SMMINTRIN_H
11
#define __SMMINTRIN_H
12
 
13
#if !defined(__i386__) && !defined(__x86_64__)
14
#error "This header is only meant to be used on x86 and x64 architecture"
15
#endif
16
 
17
#include <tmmintrin.h>
18
 
19
/* Define the default attributes for the functions in this file.
 *
 * Every inline function below is declared with this macro, so each one is
 * marked always-inline and no-debug, is compiled for the "sse4.1" target
 * feature, and carries a minimum vector width of 128.
 */
20
#define __DEFAULT_FN_ATTRS                                                     \
21
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),         \
22
                 __min_vector_width__(128)))
23
 
24
/* SSE4 Rounding macros.
 *
 * These constants build the rounding-control operand M taken by the
 * _mm_round_* macros in this file. As documented on those macros, bits [1:0]
 * of M select the rounding direction, bit [2] selects the current MXCSR
 * rounding mode instead of bits [1:0], and bit [3] controls whether the
 * precision exception is reported. The first group below sets bits [2:0],
 * the second group sets bit [3], and the third group combines one value from
 * each.
 */
25
#define _MM_FROUND_TO_NEAREST_INT 0x00
26
#define _MM_FROUND_TO_NEG_INF 0x01
27
#define _MM_FROUND_TO_POS_INF 0x02
28
#define _MM_FROUND_TO_ZERO 0x03
29
#define _MM_FROUND_CUR_DIRECTION 0x04
30
 
31
#define _MM_FROUND_RAISE_EXC 0x00
32
#define _MM_FROUND_NO_EXC 0x08
33
 
34
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
35
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
36
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
37
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
38
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
39
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
40
 
41
/// Rounds up each element of the 128-bit vector of [4 x float] to an
42
///    integer and returns the rounded values in a 128-bit vector of
43
///    [4 x float].
44
///
45
/// \headerfile <x86intrin.h>
46
///
47
/// \code
48
/// __m128 _mm_ceil_ps(__m128 X);
49
/// \endcode
50
///
51
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
52
///
53
/// \param X
54
///    A 128-bit vector of [4 x float] values to be rounded up.
55
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
56
/* Expands to _mm_round_ps with the rounding operand fixed to
   _MM_FROUND_CEIL; X appears once, parenthesized, in the expansion. */
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
57
 
58
/// Rounds up each element of the 128-bit vector of [2 x double] to an
59
///    integer and returns the rounded values in a 128-bit vector of
60
///    [2 x double].
61
///
62
/// \headerfile <x86intrin.h>
63
///
64
/// \code
65
/// __m128d _mm_ceil_pd(__m128d X);
66
/// \endcode
67
///
68
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
69
///
70
/// \param X
71
///    A 128-bit vector of [2 x double] values to be rounded up.
72
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
73
/* Expands to _mm_round_pd with the rounding operand fixed to
   _MM_FROUND_CEIL; X appears once, parenthesized, in the expansion. */
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
74
 
75
/// Copies three upper elements of the first 128-bit vector operand to
76
///    the corresponding three upper elements of the 128-bit result vector of
77
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
78
///    operand to an integer and copies it to the lowest element of the 128-bit
79
///    result vector of [4 x float].
80
///
81
/// \headerfile <x86intrin.h>
82
///
83
/// \code
84
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
85
/// \endcode
86
///
87
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
88
///
89
/// \param X
90
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
91
///    copied to the corresponding bits of the result.
92
/// \param Y
93
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
94
///    rounded up to the nearest integer and copied to the corresponding bits
95
///    of the result.
96
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
97
///    values.
98
/* Expands to _mm_round_ss with the rounding operand fixed to
   _MM_FROUND_CEIL; X and Y are passed through in the same order. */
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
99
 
100
/// Copies the upper element of the first 128-bit vector operand to the
101
///    corresponding upper element of the 128-bit result vector of [2 x double].
102
///    Rounds up the lower element of the second 128-bit vector operand to an
103
///    integer and copies it to the lower element of the 128-bit result vector
104
///    of [2 x double].
105
///
106
/// \headerfile <x86intrin.h>
107
///
108
/// \code
109
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
110
/// \endcode
111
///
112
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
113
///
114
/// \param X
115
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
116
///    copied to the corresponding bits of the result.
117
/// \param Y
118
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
119
///    rounded up to the nearest integer and copied to the corresponding bits
120
///    of the result.
121
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
122
///    values.
123
/* Expands to _mm_round_sd with the rounding operand fixed to
   _MM_FROUND_CEIL; X and Y are passed through in the same order. */
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
124
 
125
/// Rounds down each element of the 128-bit vector of [4 x float] to an
126
///    integer and returns the rounded values in a 128-bit vector of
127
///    [4 x float].
128
///
129
/// \headerfile <x86intrin.h>
130
///
131
/// \code
132
/// __m128 _mm_floor_ps(__m128 X);
133
/// \endcode
134
///
135
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
136
///
137
/// \param X
138
///    A 128-bit vector of [4 x float] values to be rounded down.
139
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
140
/* Expands to _mm_round_ps with the rounding operand fixed to
   _MM_FROUND_FLOOR; X appears once, parenthesized, in the expansion. */
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
141
 
142
/// Rounds down each element of the 128-bit vector of [2 x double] to an
143
///    integer and returns the rounded values in a 128-bit vector of
144
///    [2 x double].
145
///
146
/// \headerfile <x86intrin.h>
147
///
148
/// \code
149
/// __m128d _mm_floor_pd(__m128d X);
150
/// \endcode
151
///
152
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
153
///
154
/// \param X
155
///    A 128-bit vector of [2 x double] values to be rounded down.
156
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
157
/* Expands to _mm_round_pd with the rounding operand fixed to
   _MM_FROUND_FLOOR; X appears once, parenthesized, in the expansion. */
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
158
 
159
/// Copies three upper elements of the first 128-bit vector operand to
160
///    the corresponding three upper elements of the 128-bit result vector of
161
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
162
///    operand to an integer and copies it to the lowest element of the 128-bit
163
///    result vector of [4 x float].
164
///
165
/// \headerfile <x86intrin.h>
166
///
167
/// \code
168
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
169
/// \endcode
170
///
171
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
172
///
173
/// \param X
174
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
175
///    copied to the corresponding bits of the result.
176
/// \param Y
177
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
178
///    rounded down to the nearest integer and copied to the corresponding bits
179
///    of the result.
180
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
181
///    values.
182
/* Expands to _mm_round_ss with the rounding operand fixed to
   _MM_FROUND_FLOOR; X and Y are passed through in the same order. */
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
183
 
184
/// Copies the upper element of the first 128-bit vector operand to the
185
///    corresponding upper element of the 128-bit result vector of [2 x double].
186
///    Rounds down the lower element of the second 128-bit vector operand to an
187
///    integer and copies it to the lower element of the 128-bit result vector
188
///    of [2 x double].
189
///
190
/// \headerfile <x86intrin.h>
191
///
192
/// \code
193
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
194
/// \endcode
195
///
196
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
197
///
198
/// \param X
199
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
200
///    copied to the corresponding bits of the result.
201
/// \param Y
202
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
203
///    rounded down to the nearest integer and copied to the corresponding bits
204
///    of the result.
205
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
206
///    values.
207
/* Expands to _mm_round_sd with the rounding operand fixed to
   _MM_FROUND_FLOOR; X and Y are passed through in the same order. */
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
208
 
209
/// Rounds each element of the 128-bit vector of [4 x float] to an
210
///    integer value according to the rounding control specified by the second
211
///    argument and returns the rounded values in a 128-bit vector of
212
///    [4 x float].
213
///
214
/// \headerfile <x86intrin.h>
215
///
216
/// \code
217
/// __m128 _mm_round_ps(__m128 X, const int M);
218
/// \endcode
219
///
220
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
221
///
222
/// \param X
223
///    A 128-bit vector of [4 x float].
224
/// \param M
225
///    An integer value that specifies the rounding operation. \n
226
///    Bits [7:4] are reserved. \n
227
///    Bit [3] is a precision exception value: \n
228
///      0: A normal PE exception is used \n
229
///      1: The PE field is not updated \n
230
///    Bit [2] is the rounding control source: \n
231
///      0: Use bits [1:0] of \a M \n
232
///      1: Use the current MXCSR setting \n
233
///    Bits [1:0] contain the rounding control definition: \n
234
///      00: Nearest \n
235
///      01: Downward (toward negative infinity) \n
236
///      10: Upward (toward positive infinity) \n
237
///      11: Truncated
238
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
239
/* X is cast to __m128 and then to __v4sf before reaching the builtin; M is
   forwarded unchanged as the rounding-control operand, and the result is cast
   back to __m128.
   NOTE(review): presumably M must be a compile-time constant -- confirm. */
#define _mm_round_ps(X, M)                                                     \
240
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
241
 
242
/// Copies three upper elements of the first 128-bit vector operand to
243
///    the corresponding three upper elements of the 128-bit result vector of
244
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
245
///    operand to an integer value according to the rounding control specified
246
///    by the third argument and copies it to the lowest element of the 128-bit
247
///    result vector of [4 x float].
248
///
249
/// \headerfile <x86intrin.h>
250
///
251
/// \code
252
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
253
/// \endcode
254
///
255
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
256
///
257
/// \param X
258
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
259
///    copied to the corresponding bits of the result.
260
/// \param Y
261
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
262
///    rounded to the nearest integer using the specified rounding control and
263
///    copied to the corresponding bits of the result.
264
/// \param M
265
///    An integer value that specifies the rounding operation. \n
266
///    Bits [7:4] are reserved. \n
267
///    Bit [3] is a precision exception value: \n
268
///      0: A normal PE exception is used \n
269
///      1: The PE field is not updated \n
270
///    Bit [2] is the rounding control source: \n
271
///      0: Use bits [1:0] of \a M \n
272
///      1: Use the current MXCSR setting \n
273
///    Bits [1:0] contain the rounding control definition: \n
274
///      00: Nearest \n
275
///      01: Downward (toward negative infinity) \n
276
///      10: Upward (toward positive infinity) \n
277
///      11: Truncated
278
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
279
///    values.
280
/* X and Y are each cast to __m128 and then to __v4sf before reaching the
   builtin; M is forwarded unchanged as the rounding-control operand, and the
   result is cast back to __m128.
   NOTE(review): presumably M must be a compile-time constant -- confirm. */
#define _mm_round_ss(X, Y, M)                                                  \
281
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
282
                                  (M)))
283
 
284
/// Rounds each element of the 128-bit vector of [2 x double] to an
285
///    integer value according to the rounding control specified by the second
286
///    argument and returns the rounded values in a 128-bit vector of
287
///    [2 x double].
288
///
289
/// \headerfile <x86intrin.h>
290
///
291
/// \code
292
/// __m128d _mm_round_pd(__m128d X, const int M);
293
/// \endcode
294
///
295
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
296
///
297
/// \param X
298
///    A 128-bit vector of [2 x double].
299
/// \param M
300
///    An integer value that specifies the rounding operation. \n
301
///    Bits [7:4] are reserved. \n
302
///    Bit [3] is a precision exception value: \n
303
///      0: A normal PE exception is used \n
304
///      1: The PE field is not updated \n
305
///    Bit [2] is the rounding control source: \n
306
///      0: Use bits [1:0] of \a M \n
307
///      1: Use the current MXCSR setting \n
308
///    Bits [1:0] contain the rounding control definition: \n
309
///      00: Nearest \n
310
///      01: Downward (toward negative infinity) \n
311
///      10: Upward (toward positive infinity) \n
312
///      11: Truncated
313
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
314
/* X is cast to __m128d and then to __v2df before reaching the builtin; M is
   forwarded unchanged as the rounding-control operand, and the result is cast
   back to __m128d.
   NOTE(review): presumably M must be a compile-time constant -- confirm. */
#define _mm_round_pd(X, M)                                                     \
315
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
316
 
317
/// Copies the upper element of the first 128-bit vector operand to the
318
///    corresponding upper element of the 128-bit result vector of [2 x double].
319
///    Rounds the lower element of the second 128-bit vector operand to an
320
///    integer value according to the rounding control specified by the third
321
///    argument and copies it to the lower element of the 128-bit result vector
322
///    of [2 x double].
323
///
324
/// \headerfile <x86intrin.h>
325
///
326
/// \code
327
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
328
/// \endcode
329
///
330
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
331
///
332
/// \param X
333
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
334
///    copied to the corresponding bits of the result.
335
/// \param Y
336
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
337
///    rounded to the nearest integer using the specified rounding control and
338
///    copied to the corresponding bits of the result.
339
/// \param M
340
///    An integer value that specifies the rounding operation. \n
341
///    Bits [7:4] are reserved. \n
342
///    Bit [3] is a precision exception value: \n
343
///      0: A normal PE exception is used \n
344
///      1: The PE field is not updated \n
345
///    Bit [2] is the rounding control source: \n
346
///      0: Use bits [1:0] of \a M \n
347
///      1: Use the current MXCSR setting \n
348
///    Bits [1:0] contain the rounding control definition: \n
349
///      00: Nearest \n
350
///      01: Downward (toward negative infinity) \n
351
///      10: Upward (toward positive infinity) \n
352
///      11: Truncated
353
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
354
///    values.
355
/* X and Y are each cast to __m128d and then to __v2df before reaching the
   builtin; M is forwarded unchanged as the rounding-control operand, and the
   result is cast back to __m128d.
   NOTE(review): presumably M must be a compile-time constant -- confirm. */
#define _mm_round_sd(X, Y, M)                                                  \
356
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
357
                                   (M)))
358
 
359
/* SSE4 Packed Blending Intrinsics.  */
360
/// Returns a 128-bit vector of [2 x double] where the values are
361
///    selected from either the first or second operand as specified by the
362
///    third operand, the control mask.
363
///
364
/// \headerfile <x86intrin.h>
365
///
366
/// \code
367
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
368
/// \endcode
369
///
370
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
371
///
372
/// \param V1
373
///    A 128-bit vector of [2 x double].
374
/// \param V2
375
///    A 128-bit vector of [2 x double].
376
/// \param M
377
///    An immediate integer operand, with mask bits [1:0] specifying how the
378
///    values are to be copied. The position of the mask bit corresponds to the
379
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
380
///    element in operand \a V1 is copied to the same position in the result.
381
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
382
///    is copied to the same position in the result.
383
/// \returns A 128-bit vector of [2 x double] containing the copied values.
384
/* V1 and V2 are each cast to __m128d and then to __v2df before reaching the
   builtin; M is cast to int and passed as the mask operand. The result is
   cast back to __m128d. */
#define _mm_blend_pd(V1, V2, M)                                                \
385
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
386
                                   (__v2df)(__m128d)(V2), (int)(M)))
387
 
388
/// Returns a 128-bit vector of [4 x float] where the values are selected
389
///    from either the first or second operand as specified by the third
390
///    operand, the control mask.
391
///
392
/// \headerfile <x86intrin.h>
393
///
394
/// \code
395
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
396
/// \endcode
397
///
398
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
399
///
400
/// \param V1
401
///    A 128-bit vector of [4 x float].
402
/// \param V2
403
///    A 128-bit vector of [4 x float].
404
/// \param M
405
///    An immediate integer operand, with mask bits [3:0] specifying how the
406
///    values are to be copied. The position of the mask bit corresponds to the
407
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
408
///    element in operand \a V1 is copied to the same position in the result.
409
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
410
///    is copied to the same position in the result.
411
/// \returns A 128-bit vector of [4 x float] containing the copied values.
412
/* V1 and V2 are each cast to __m128 and then to __v4sf before reaching the
   builtin; M is cast to int and passed as the mask operand. The result is
   cast back to __m128. */
#define _mm_blend_ps(V1, V2, M)                                                \
413
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
414
                                  (int)(M)))
415
 
416
/// Returns a 128-bit vector of [2 x double] where the values are
417
///    selected from either the first or second operand as specified by the
418
///    third operand, the control mask.
419
///
420
/// \headerfile <x86intrin.h>
421
///
422
/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
423
///
424
/// \param __V1
425
///    A 128-bit vector of [2 x double].
426
/// \param __V2
427
///    A 128-bit vector of [2 x double].
428
/// \param __M
429
///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
430
///    values are to be copied. The position of the mask bit corresponds to the
431
///    most significant bit of a copied value. When a mask bit is 0, the
432
///    corresponding 64-bit element in operand \a __V1 is copied to the same
433
///    position in the result. When a mask bit is 1, the corresponding 64-bit
434
///    element in operand \a __V2 is copied to the same position in the result.
435
/// \returns A 128-bit vector of [2 x double] containing the copied values.
436
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_blendv_pd(__m128d __V1, __m128d __V2, __m128d __M) {
  /* View the two sources and the mask as __v2df, the type the builtin
     takes. */
  __v2df __if_clear = (__v2df)__V1;
  __v2df __if_set = (__v2df)__V2;
  __v2df __sel = (__v2df)__M;

  /* Operand order is preserved: first source, second source, mask. */
  return (__m128d)__builtin_ia32_blendvpd(__if_clear, __if_set, __sel);
}
442
 
443
/// Returns a 128-bit vector of [4 x float] where the values are
444
///    selected from either the first or second operand as specified by the
445
///    third operand, the control mask.
446
///
447
/// \headerfile <x86intrin.h>
448
///
449
/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
450
///
451
/// \param __V1
452
///    A 128-bit vector of [4 x float].
453
/// \param __V2
454
///    A 128-bit vector of [4 x float].
455
/// \param __M
456
///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
457
///    how the values are to be copied. The position of the mask bit corresponds
458
///    to the most significant bit of a copied value. When a mask bit is 0, the
459
///    corresponding 32-bit element in operand \a __V1 is copied to the same
460
///    position in the result. When a mask bit is 1, the corresponding 32-bit
461
///    element in operand \a __V2 is copied to the same position in the result.
462
/// \returns A 128-bit vector of [4 x float] containing the copied values.
463
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_blendv_ps(__m128 __V1, __m128 __V2, __m128 __M) {
  /* View the two sources and the mask as __v4sf, the type the builtin
     takes. */
  __v4sf __if_clear = (__v4sf)__V1;
  __v4sf __if_set = (__v4sf)__V2;
  __v4sf __sel = (__v4sf)__M;

  /* Operand order is preserved: first source, second source, mask. */
  return (__m128)__builtin_ia32_blendvps(__if_clear, __if_set, __sel);
}
469
 
470
/// Returns a 128-bit vector of [16 x i8] where the values are selected
471
///    from either of the first or second operand as specified by the third
472
///    operand, the control mask.
473
///
474
/// \headerfile <x86intrin.h>
475
///
476
/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
477
///
478
/// \param __V1
479
///    A 128-bit vector of [16 x i8].
480
/// \param __V2
481
///    A 128-bit vector of [16 x i8].
482
/// \param __M
483
///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
484
///    how the values are to be copied. The position of the mask bit corresponds
485
///    to the most significant bit of a copied value. When a mask bit is 0, the
486
///    corresponding 8-bit element in operand \a __V1 is copied to the same
487
///    position in the result. When a mask bit is 1, the corresponding 8-bit
488
///    element in operand \a __V2 is copied to the same position in the result.
489
/// \returns A 128-bit vector of [16 x i8] containing the copied values.
490
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_blendv_epi8(__m128i __V1, __m128i __V2, __m128i __M) {
  /* View the two sources and the mask as __v16qi, the type the builtin
     takes. */
  __v16qi __if_clear = (__v16qi)__V1;
  __v16qi __if_set = (__v16qi)__V2;
  __v16qi __sel = (__v16qi)__M;

  /* Operand order is preserved: first source, second source, mask. */
  return (__m128i)__builtin_ia32_pblendvb128(__if_clear, __if_set, __sel);
}
496
 
497
/// Returns a 128-bit vector of [8 x i16] where the values are selected
498
///    from either of the first or second operand as specified by the third
499
///    operand, the control mask.
500
///
501
/// \headerfile <x86intrin.h>
502
///
503
/// \code
504
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
505
/// \endcode
506
///
507
/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
508
///
509
/// \param V1
510
///    A 128-bit vector of [8 x i16].
511
/// \param V2
512
///    A 128-bit vector of [8 x i16].
513
/// \param M
514
///    An immediate integer operand, with mask bits [7:0] specifying how the
515
///    values are to be copied. The position of the mask bit corresponds to the
516
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
517
///    element in operand \a V1 is copied to the same position in the result.
518
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
519
///    is copied to the same position in the result.
520
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
521
/* V1 and V2 are each cast to __m128i and then to __v8hi before reaching the
   builtin; M is cast to int and passed as the mask operand. The result is
   cast back to __m128i. */
#define _mm_blend_epi16(V1, V2, M)                                             \
522
  ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
523
                                      (__v8hi)(__m128i)(V2), (int)(M)))
524
 
525
/* SSE4 Dword Multiply Instructions.  */
526
/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
527
///    and returns the lower 32 bits of each product in a 128-bit vector of
528
///    [4 x i32].
529
///
530
/// \headerfile <x86intrin.h>
531
///
532
/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
533
///
534
/// \param __V1
535
///    A 128-bit integer vector.
536
/// \param __V2
537
///    A 128-bit integer vector.
538
/// \returns A 128-bit integer vector containing the products of both operands.
539
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi32(__m128i __V1, __m128i __V2) {
  /* The multiply is carried out on the operands viewed as __v4su. */
  __v4su __lhs = (__v4su)__V1;
  __v4su __rhs = (__v4su)__V2;
  __v4su __prod = __lhs * __rhs;

  return (__m128i)__prod;
}
543
 
544
/// Multiplies corresponding even-indexed elements of two 128-bit
545
///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
546
///    containing the products.
547
///
548
/// \headerfile <x86intrin.h>
549
///
550
/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
551
///
552
/// \param __V1
553
///    A 128-bit vector of [4 x i32].
554
/// \param __V2
555
///    A 128-bit vector of [4 x i32].
556
/// \returns A 128-bit vector of [2 x i64] containing the products of both
557
///    operands.
558
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epi32(__m128i __V1, __m128i __V2) {
  /* Both operands are handed to the builtin as __v4si. */
  __v4si __lhs = (__v4si)__V1;
  __v4si __rhs = (__v4si)__V2;

  return (__m128i)__builtin_ia32_pmuldq128(__lhs, __rhs);
}
562
 
563
/* SSE4 Floating Point Dot Product Instructions.  */
564
/// Computes the dot product of the two 128-bit vectors of [4 x float]
565
///    and returns it in the elements of the 128-bit result vector of
566
///    [4 x float].
567
///
568
///    The immediate integer operand controls which input elements
569
///    will contribute to the dot product, and where the final results are
570
///    returned.
571
///
572
/// \headerfile <x86intrin.h>
573
///
574
/// \code
575
/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
576
/// \endcode
577
///
578
/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
579
///
580
/// \param X
581
///    A 128-bit vector of [4 x float].
582
/// \param Y
583
///    A 128-bit vector of [4 x float].
584
/// \param M
585
///    An immediate integer operand. Mask bits [7:4] determine which elements
586
///    of the input vectors are used, with bit [4] corresponding to the lowest
587
///    element and bit [7] corresponding to the highest element of each [4 x
588
///    float] vector. If a bit is set, the corresponding elements from the two
589
///    input vectors are used as an input for dot product; otherwise that input
590
///    is treated as zero. Bits [3:0] determine which elements of the result
591
///    will receive a copy of the final dot product, with bit [0] corresponding
592
///    to the lowest element and bit [3] corresponding to the highest element of
593
///    the [4 x float] result. If a bit is set, the dot product is returned
594
///    in the corresponding element; otherwise that element is set to zero.
595
/// \returns A 128-bit vector of [4 x float] containing the dot product.
596
/* X and Y are each cast to __m128 and then to __v4sf before reaching the
   builtin; M is forwarded unchanged as the control operand. The result is
   cast back to __m128. */
#define _mm_dp_ps(X, Y, M)                                                     \
597
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
598
 
599
/// Computes the dot product of the two 128-bit vectors of [2 x double]
600
///    and returns it in the elements of the 128-bit result vector of
601
///    [2 x double].
602
///
603
///    The immediate integer operand controls which input
604
///    elements will contribute to the dot product, and where the final results
605
///    are returned.
606
///
607
/// \headerfile <x86intrin.h>
608
///
609
/// \code
610
/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
611
/// \endcode
612
///
613
/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
614
///
615
/// \param X
616
///    A 128-bit vector of [2 x double].
617
/// \param Y
618
///    A 128-bit vector of [2 x double].
619
/// \param M
620
///    An immediate integer operand. Mask bits [5:4] determine which elements
621
///    of the input vectors are used, with bit [4] corresponding to the lowest
622
///    element and bit [5] corresponding to the highest element of each [2 x
623
///    double] vector. If a bit is set, the corresponding elements from the two
624
///    input vectors are used as an input for dot product; otherwise that input
625
///    is treated as zero. Bits [1:0] determine which elements of the result
626
///    will receive a copy of the final dot product, with bit [0] corresponding
627
///    to the lowest element and bit [1] corresponding to the highest element of
628
///    each [2 x double] vector. If a bit is set, the dot product is returned in
629
///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
630
/* X and Y are each cast to __m128d and then to __v2df before reaching the
   builtin; M is forwarded unchanged as the control operand. The result is
   cast back to __m128d. */
#define _mm_dp_pd(X, Y, M)                                                     \
631
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
632
                                (M)))
633
 
634
/* SSE4 Streaming Load Hint Instruction.  */
635
/// Loads integer values from a 128-bit aligned memory location to a
636
///    128-bit integer vector.
637
///
638
/// \headerfile <x86intrin.h>
639
///
640
/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
641
///
642
/// \param __V
643
///    A pointer to a 128-bit aligned memory location that contains the integer
644
///    values.
645
/// \returns A 128-bit integer vector containing the data stored at the
646
///    specified memory location.
647
static __inline__ __m128i __DEFAULT_FN_ATTRS
648
_mm_stream_load_si128(__m128i const *__V) {
649
  return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
650
}
651
 
652
/* SSE4 Packed Integer Min/Max Instructions.  */
653
/// Compares the corresponding elements of two 128-bit vectors of
654
///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
655
///    of the two values.
656
///
657
/// \headerfile <x86intrin.h>
658
///
659
/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
660
///
661
/// \param __V1
662
///    A 128-bit vector of [16 x i8].
663
/// \param __V2
664
///    A 128-bit vector of [16 x i8].
665
/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
666
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi8(__m128i __V1, __m128i __V2) {
  /* Element-wise minimum over the operands viewed as __v16qs. */
  __v16qs __lhs = (__v16qs)__V1;
  __v16qs __rhs = (__v16qs)__V2;

  return (__m128i)__builtin_elementwise_min(__lhs, __rhs);
}
670
 
671
/// Compares the corresponding elements of two 128-bit vectors of
672
///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
673
///    greater value of the two.
674
///
675
/// \headerfile <x86intrin.h>
676
///
677
/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
678
///
679
/// \param __V1
680
///    A 128-bit vector of [16 x i8].
681
/// \param __V2
682
///    A 128-bit vector of [16 x i8].
683
/// \returns A 128-bit vector of [16 x i8] containing the greater values.
684
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi8(__m128i __V1, __m128i __V2) {
  /* Element-wise maximum over the operands viewed as __v16qs. */
  __v16qs __lhs = (__v16qs)__V1;
  __v16qs __rhs = (__v16qs)__V2;

  return (__m128i)__builtin_elementwise_max(__lhs, __rhs);
}
688
 
689
/// Compares the corresponding elements of two 128-bit vectors of
690
///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
691
///    value of the two.
692
///
693
/// \headerfile <x86intrin.h>
694
///
695
/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
696
///
697
/// \param __V1
698
///    A 128-bit vector of [8 x u16].
699
/// \param __V2
700
///    A 128-bit vector of [8 x u16].
701
/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
702
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
                                                           __m128i __V2) {
  /* View both operands as [8 x u16] so the element-wise minimum is taken on
     unsigned 16-bit lanes. */
  __v8hu __lhs = (__v8hu)__V1;
  __v8hu __rhs = (__v8hu)__V2;
  return (__m128i)__builtin_elementwise_min(__lhs, __rhs);
}
706
 
707
/// Compares the corresponding elements of two 128-bit vectors of
708
///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
709
///    greater value of the two.
710
///
711
/// \headerfile <x86intrin.h>
712
///
713
/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
714
///
715
/// \param __V1
716
///    A 128-bit vector of [8 x u16].
717
/// \param __V2
718
///    A 128-bit vector of [8 x u16].
719
/// \returns A 128-bit vector of [8 x u16] containing the greater values.
720
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
                                                           __m128i __V2) {
  /* View both operands as [8 x u16] so the element-wise maximum is taken on
     unsigned 16-bit lanes. */
  __v8hu __lhs = (__v8hu)__V1;
  __v8hu __rhs = (__v8hu)__V2;
  return (__m128i)__builtin_elementwise_max(__lhs, __rhs);
}
724
 
725
/// Compares the corresponding elements of two 128-bit vectors of
726
///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
727
///    value of the two.
728
///
729
/// \headerfile <x86intrin.h>
730
///
731
/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
732
///
733
/// \param __V1
734
///    A 128-bit vector of [4 x i32].
735
/// \param __V2
736
///    A 128-bit vector of [4 x i32].
737
/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
738
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
                                                           __m128i __V2) {
  /* View both operands as [4 x i32] so the element-wise minimum is taken on
     signed 32-bit lanes. */
  __v4si __lhs = (__v4si)__V1;
  __v4si __rhs = (__v4si)__V2;
  return (__m128i)__builtin_elementwise_min(__lhs, __rhs);
}
742
 
743
/// Compares the corresponding elements of two 128-bit vectors of
744
///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
745
///    greater value of the two.
746
///
747
/// \headerfile <x86intrin.h>
748
///
749
/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
750
///
751
/// \param __V1
752
///    A 128-bit vector of [4 x i32].
753
/// \param __V2
754
///    A 128-bit vector of [4 x i32].
755
/// \returns A 128-bit vector of [4 x i32] containing the greater values.
756
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
                                                           __m128i __V2) {
  /* View both operands as [4 x i32] so the element-wise maximum is taken on
     signed 32-bit lanes. */
  __v4si __lhs = (__v4si)__V1;
  __v4si __rhs = (__v4si)__V2;
  return (__m128i)__builtin_elementwise_max(__lhs, __rhs);
}
760
 
761
/// Compares the corresponding elements of two 128-bit vectors of
762
///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
763
///    value of the two.
764
///
765
/// \headerfile <x86intrin.h>
766
///
767
/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c>  instruction.
768
///
769
/// \param __V1
770
///    A 128-bit vector of [4 x u32].
771
/// \param __V2
772
///    A 128-bit vector of [4 x u32].
773
/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
774
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
                                                           __m128i __V2) {
  /* View both operands as [4 x u32] so the element-wise minimum is taken on
     unsigned 32-bit lanes. */
  __v4su __lhs = (__v4su)__V1;
  __v4su __rhs = (__v4su)__V2;
  return (__m128i)__builtin_elementwise_min(__lhs, __rhs);
}
778
 
779
/// Compares the corresponding elements of two 128-bit vectors of
780
///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
781
///    greater value of the two.
782
///
783
/// \headerfile <x86intrin.h>
784
///
785
/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
786
///
787
/// \param __V1
788
///    A 128-bit vector of [4 x u32].
789
/// \param __V2
790
///    A 128-bit vector of [4 x u32].
791
/// \returns A 128-bit vector of [4 x u32] containing the greater values.
792
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
                                                           __m128i __V2) {
  /* View both operands as [4 x u32] so the element-wise maximum is taken on
     unsigned 32-bit lanes. */
  __v4su __lhs = (__v4su)__V1;
  __v4su __rhs = (__v4su)__V2;
  return (__m128i)__builtin_elementwise_max(__lhs, __rhs);
}
796
 
797
/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
798
/// Takes the first argument \a X and inserts an element from the second
799
///    argument \a Y as selected by the third argument \a N. That result then
800
///    has elements zeroed out also as selected by the third argument \a N. The
801
///    resulting 128-bit vector of [4 x float] is then returned.
802
///
803
/// \headerfile <x86intrin.h>
804
///
805
/// \code
806
/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
807
/// \endcode
808
///
809
/// This intrinsic corresponds to the <c> VINSERTPS / INSERTPS </c> instruction.
810
///
811
/// \param X
812
///    A 128-bit vector source operand of [4 x float]. With the exception of
813
///    those bits in the result copied from parameter \a Y and zeroed by bits
814
///    [3:0] of \a N, all bits from this parameter are copied to the result.
815
/// \param Y
816
///    A 128-bit vector source operand of [4 x float]. One single-precision
817
///    floating-point element from this source, as determined by the immediate
818
///    parameter, is copied to the result.
819
/// \param N
820
///    Specifies which bits from operand \a Y will be copied, which bits in the
821
///    result they will be copied to, and which bits in the result will be
822
///    cleared. The following assignments are made: \n
823
///    Bits [7:6] specify the bits to copy from operand \a Y: \n
824
///      00: Selects bits [31:0] from operand \a Y. \n
825
///      01: Selects bits [63:32] from operand \a Y. \n
826
///      10: Selects bits [95:64] from operand \a Y. \n
827
///      11: Selects bits [127:96] from operand \a Y. \n
828
///    Bits [5:4] specify the bits in the result to which the selected bits
829
///    from operand \a Y are copied: \n
830
///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
831
///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
832
///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
833
///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
834
///    Bits[3:0]: If any of these bits are set, the corresponding result
835
///    element is cleared.
836
/// \returns A 128-bit vector of [4 x float] containing the copied
837
///    single-precision floating point elements from the operands.
838
/* Thin wrapper: X, Y and the control value N are forwarded unchanged to the
   insertps builtin. */
#define _mm_insert_ps(X, Y, N)                                                 \
  __builtin_ia32_insertps128((X), (Y), (N))
839
 
840
/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
841
///    returns it, using the immediate value parameter \a N as a selector.
842
///
843
/// \headerfile <x86intrin.h>
844
///
845
/// \code
846
/// int _mm_extract_ps(__m128 X, const int N);
847
/// \endcode
848
///
849
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
850
/// instruction.
851
///
852
/// \param X
853
///    A 128-bit vector of [4 x float].
854
/// \param N
855
///    An immediate value. Bits [1:0] determine which bits from the argument
856
///    \a X are extracted and returned: \n
857
///    00: Bits [31:0] of parameter \a X are returned. \n
858
///    01: Bits [63:32] of parameter \a X are returned. \n
859
///    10: Bits [95:64] of parameter \a X are returned. \n
860
///    11: Bits [127:96] of parameter \a X are returned.
861
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
862
/* The builtin yields the selected float element; __builtin_bit_cast then
   reinterprets those 32 bits as an int rather than converting the value. */
#define _mm_extract_ps(X, N)                                                   \
  __builtin_bit_cast(int,                                                      \
                     __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X),          \
                                                 (int)(N)))
865
 
866
/* Miscellaneous insert and extract macros.  */
867
/* Extract a single-precision float from X at index N into D.  */
868
/* Wrapped in do { ... } while (0) so the expansion behaves as one statement
   when followed by a semicolon. D receives element N of X as a float. */
#define _MM_EXTRACT_FLOAT(D, X, N)                                             \
  do {                                                                         \
    (D) = __builtin_ia32_vec_ext_v4sf(                                         \
        (__v4sf)(__m128)(X), (int)(N));                                        \
  } while (0)
872
 
873
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
874
   an index suitable for _mm_insert_ps.  */
875
/* Resulting layout: X lands in bits [7:6], Y in bits [5:4], and Z occupies
   bits [3:0], matching the control value described for _mm_insert_ps. */
#define _MM_MK_INSERTPS_NDX(X, Y, Z)                                           \
  (((X) << 6) | ((Y) << 4) | (Z))
876
 
877
/* Extract a float from X at index N into the first index of the return.  */
878
/* Inserts element N of X into element 0 of an all-zero vector; the 0x0e zero
   mask sets control bits 1-3, i.e. the bits for result elements 1 through 3. */
#define _MM_PICK_OUT_PS(X, N)                                                  \
  _mm_insert_ps(_mm_setzero_ps(), (X),                                         \
                _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
880
 
881
/* Insert int into packed integer array at index.  */
882
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
883
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
884
///    of an integer parameter \a I into an offset specified by the immediate
885
///    value parameter \a N.
886
///
887
/// \headerfile <x86intrin.h>
888
///
889
/// \code
890
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
891
/// \endcode
892
///
893
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
894
///
895
/// \param X
896
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
897
///    result and then one of the sixteen elements in the result vector is
898
///    replaced by the lower 8 bits of \a I.
899
/// \param I
900
///    An integer. The lower 8 bits of this operand are written to the result
901
///    beginning at the offset specified by \a N.
902
/// \param N
903
///    An immediate value. Bits [3:0] specify the bit offset in the result at
904
///    which the lower 8 bits of \a I are written. \n
905
///    0000: Bits [7:0] of the result are used for insertion. \n
906
///    0001: Bits [15:8] of the result are used for insertion. \n
907
///    0010: Bits [23:16] of the result are used for insertion. \n
908
///    0011: Bits [31:24] of the result are used for insertion. \n
909
///    0100: Bits [39:32] of the result are used for insertion. \n
910
///    0101: Bits [47:40] of the result are used for insertion. \n
911
///    0110: Bits [55:48] of the result are used for insertion. \n
912
///    0111: Bits [63:56] of the result are used for insertion. \n
913
///    1000: Bits [71:64] of the result are used for insertion. \n
914
///    1001: Bits [79:72] of the result are used for insertion. \n
915
///    1010: Bits [87:80] of the result are used for insertion. \n
916
///    1011: Bits [95:88] of the result are used for insertion. \n
917
///    1100: Bits [103:96] of the result are used for insertion. \n
918
///    1101: Bits [111:104] of the result are used for insertion. \n
919
///    1110: Bits [119:112] of the result are used for insertion. \n
920
///    1111: Bits [127:120] of the result are used for insertion.
921
/// \returns A 128-bit integer vector containing the constructed values.
922
/* X is viewed as [16 x i8]; I and N are passed to the builtin as int, and the
   updated vector is cast back to __m128i. */
#define _mm_insert_epi8(X, I, N)                                               \
  ((__m128i)__builtin_ia32_vec_set_v16qi(                                      \
      (__v16qi)(__m128i)(X), (int)(I), (int)(N)))
925
 
926
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
927
///    the 128-bit integer vector parameter, and then inserting the 32-bit
928
///    integer parameter \a I at the offset specified by the immediate value
929
///    parameter \a N.
930
///
931
/// \headerfile <x86intrin.h>
932
///
933
/// \code
934
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
935
/// \endcode
936
///
937
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
938
///
939
/// \param X
940
///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
941
///    result and then one of the four elements in the result vector is
942
///    replaced by \a I.
943
/// \param I
944
///    A 32-bit integer that is written to the result beginning at the offset
945
///    specified by \a N.
946
/// \param N
947
///    An immediate value. Bits [1:0] specify the bit offset in the result at
948
///    which the integer \a I is written. \n
949
///    00: Bits [31:0] of the result are used for insertion. \n
950
///    01: Bits [63:32] of the result are used for insertion. \n
951
///    10: Bits [95:64] of the result are used for insertion. \n
952
///    11: Bits [127:96] of the result are used for insertion.
953
/// \returns A 128-bit integer vector containing the constructed values.
954
/* X is viewed as [4 x i32]; I and N are passed to the builtin as int, and the
   updated vector is cast back to __m128i. */
#define _mm_insert_epi32(X, I, N)                                              \
  ((__m128i)__builtin_ia32_vec_set_v4si(                                       \
      (__v4si)(__m128i)(X), (int)(I), (int)(N)))
957
 
958
#ifdef __x86_64__
959
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
960
///    the 128-bit integer vector parameter, and then inserting the 64-bit
961
///    integer parameter \a I, using the immediate value parameter \a N as an
962
///    insertion location selector.
963
///
964
/// \headerfile <x86intrin.h>
965
///
966
/// \code
967
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
968
/// \endcode
969
///
970
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
971
///
972
/// \param X
973
///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
974
///    result and then one of the two elements in the result vector is replaced
975
///    by \a I.
976
/// \param I
977
///    A 64-bit integer that is written to the result beginning at the offset
978
///    specified by \a N.
979
/// \param N
980
///    An immediate value. Bit [0] specifies the bit offset in the result at
981
///    which the integer \a I is written. \n
982
///    0: Bits [63:0] of the result are used for insertion. \n
983
///    1: Bits [127:64] of the result are used for insertion.
984
/// \returns A 128-bit integer vector containing the constructed values.
985
/* X is viewed as [2 x i64]; I is passed as long long and N as int, and the
   updated vector is cast back to __m128i. */
#define _mm_insert_epi64(X, I, N)                                              \
  ((__m128i)__builtin_ia32_vec_set_v2di(                                       \
      (__v2di)(__m128i)(X), (long long)(I), (int)(N)))
988
#endif /* __x86_64__ */
989
 
990
/* Extract int from packed integer array at index.  This returns the element
991
 * as a zero extended value, so it is unsigned.
992
 */
993
/// Extracts an 8-bit element from the 128-bit integer vector of
994
///    [16 x i8], using the immediate value parameter \a N as a selector.
995
///
996
/// \headerfile <x86intrin.h>
997
///
998
/// \code
999
/// int _mm_extract_epi8(__m128i X, const int N);
1000
/// \endcode
1001
///
1002
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
1003
///
1004
/// \param X
1005
///    A 128-bit integer vector.
1006
/// \param N
1007
///    An immediate value. Bits [3:0] specify which 8-bit vector element from
1008
///    the argument \a X to extract and copy to the result. \n
1009
///    0000: Bits [7:0] of parameter \a X are extracted. \n
1010
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
1011
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
1012
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
1013
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
1014
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
1015
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
1016
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
1017
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
1018
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
1019
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
1020
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
1021
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
1022
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
1023
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
1024
///    1111: Bits [127:120] of the parameter \a X are extracted.
1025
/// \returns  An unsigned integer, whose lower 8 bits are selected from the
1026
///    128-bit integer vector parameter and the remaining bits are assigned
1027
///    zeros.
1028
/* The (unsigned char) cast is applied before widening to int, which gives the
   zero-extended result promised in the documentation above. */
#define _mm_extract_epi8(X, N)                                                 \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi(                           \
      (__v16qi)(__m128i)(X), (int)(N)))
1031
 
1032
/// Extracts a 32-bit element from the 128-bit integer vector of
1033
///    [4 x i32], using the immediate value parameter \a N as a selector.
1034
///
1035
/// \headerfile <x86intrin.h>
1036
///
1037
/// \code
1038
/// int _mm_extract_epi32(__m128i X, const int N);
1039
/// \endcode
1040
///
1041
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
1042
///
1043
/// \param X
1044
///    A 128-bit integer vector.
1045
/// \param N
1046
///    An immediate value. Bits [1:0] specify which 32-bit vector element from
1047
///    the argument \a X to extract and copy to the result. \n
1048
///    00: Bits [31:0] of the parameter \a X are extracted. \n
1049
///    01: Bits [63:32] of the parameter \a X are extracted. \n
1050
///    10: Bits [95:64] of the parameter \a X are extracted. \n
1051
///    11: Bits [127:96] of the parameter \a X are extracted.
1052
/// \returns  An integer, whose lower 32 bits are selected from the 128-bit
1053
///    integer vector parameter and the remaining bits are assigned zeros.
1054
/* X is viewed as [4 x i32]; element N is returned as int. */
#define _mm_extract_epi32(X, N)                                                \
  ((int)__builtin_ia32_vec_ext_v4si(                                           \
      (__v4si)(__m128i)(X), (int)(N)))
1056
 
1057
/// Extracts a 64-bit element from the 128-bit integer vector of
1058
///    [2 x i64], using the immediate value parameter \a N as a selector.
1059
///
1060
/// \headerfile <x86intrin.h>
1061
///
1062
/// \code
1063
/// long long _mm_extract_epi64(__m128i X, const int N);
1064
/// \endcode
1065
///
1066
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
1067
/// in 64-bit mode.
1068
///
1069
/// \param X
1070
///    A 128-bit integer vector.
1071
/// \param N
1072
///    An immediate value. Bit [0] specifies which 64-bit vector element from
1073
///    the argument \a X to return. \n
1074
///    0: Bits [63:0] are returned. \n
1075
///    1: Bits [127:64] are returned.
1076
/// \returns  A 64-bit integer.
1077
/* X is viewed as [2 x i64]; element N is returned as long long. */
#define _mm_extract_epi64(X, N)                                                \
  ((long long)__builtin_ia32_vec_ext_v2di(                                     \
      (__v2di)(__m128i)(X), (int)(N)))
1079
 
1080
/* SSE4 128-bit Packed Integer Comparisons.  */
1081
/// Tests whether the specified bits in a 128-bit integer vector are all
1082
///    zeros.
1083
///
1084
/// \headerfile <x86intrin.h>
1085
///
1086
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1087
///
1088
/// \param __M
1089
///    A 128-bit integer vector containing the bits to be tested.
1090
/// \param __V
1091
///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1092
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1093
static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
                                                         __m128i __V) {
  /* Both operands are handed to the ptestz builtin viewed as [2 x i64]. */
  __v2di __bits = (__v2di)__M;
  __v2di __mask = (__v2di)__V;
  return __builtin_ia32_ptestz128(__bits, __mask);
}
1097
 
1098
/// Tests whether the specified bits in a 128-bit integer vector are all
1099
///    ones.
1100
///
1101
/// \headerfile <x86intrin.h>
1102
///
1103
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1104
///
1105
/// \param __M
1106
///    A 128-bit integer vector containing the bits to be tested.
1107
/// \param __V
1108
///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1109
/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1110
static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
                                                         __m128i __V) {
  /* Both operands are handed to the ptestc builtin viewed as [2 x i64]. */
  __v2di __bits = (__v2di)__M;
  __v2di __mask = (__v2di)__V;
  return __builtin_ia32_ptestc128(__bits, __mask);
}
1114
 
1115
/// Tests whether the specified bits in a 128-bit integer vector are
1116
///    neither all zeros nor all ones.
1117
///
1118
/// \headerfile <x86intrin.h>
1119
///
1120
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1121
///
1122
/// \param __M
1123
///    A 128-bit integer vector containing the bits to be tested.
1124
/// \param __V
1125
///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1126
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1127
///    FALSE otherwise.
1128
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
                                                           __m128i __V) {
  /* Both operands are handed to the ptestnzc builtin viewed as [2 x i64]. */
  __v2di __bits = (__v2di)__M;
  __v2di __mask = (__v2di)__V;
  return __builtin_ia32_ptestnzc128(__bits, __mask);
}
1132
 
1133
/// Tests whether the specified bits in a 128-bit integer vector are all
1134
///    ones.
1135
///
1136
/// \headerfile <x86intrin.h>
1137
///
1138
/// \code
1139
/// int _mm_test_all_ones(__m128i V);
1140
/// \endcode
1141
///
1142
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1143
///
1144
/// \param V
1145
///    A 128-bit integer vector containing the bits to be tested.
1146
/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
1147
///    otherwise.
1148
/* Calls _mm_testc_si128 with an all-ones selector (_mm_set1_epi32(-1)), so
   every bit of V takes part in the test. */
#define _mm_test_all_ones(V)                                                   \
  _mm_testc_si128((V), _mm_set1_epi32(-1))
1149
 
1150
/// Tests whether the specified bits in a 128-bit integer vector are
1151
///    neither all zeros nor all ones.
1152
///
1153
/// \headerfile <x86intrin.h>
1154
///
1155
/// \code
1156
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
1157
/// \endcode
1158
///
1159
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1160
///
1161
/// \param M
1162
///    A 128-bit integer vector containing the bits to be tested.
1163
/// \param V
1164
///    A 128-bit integer vector selecting which bits to test in operand \a M.
1165
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1166
///    FALSE otherwise.
1167
/* Alternate spelling of _mm_testnzc_si128; both arguments are forwarded. */
#define _mm_test_mix_ones_zeros(M, V)                                          \
  _mm_testnzc_si128((M), (V))
1168
 
1169
/// Tests whether the specified bits in a 128-bit integer vector are all
1170
///    zeros.
1171
///
1172
/// \headerfile <x86intrin.h>
1173
///
1174
/// \code
1175
/// int _mm_test_all_zeros(__m128i M, __m128i V);
1176
/// \endcode
1177
///
1178
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1179
///
1180
/// \param M
1181
///    A 128-bit integer vector containing the bits to be tested.
1182
/// \param V
1183
///    A 128-bit integer vector selecting which bits to test in operand \a M.
1184
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1185
/* Alternate spelling of _mm_testz_si128; both arguments are forwarded. */
#define _mm_test_all_zeros(M, V)                                               \
  _mm_testz_si128((M), (V))
1186
 
1187
/* SSE4 64-bit Packed Integer Comparisons.  */
1188
/// Compares each of the corresponding 64-bit values of the 128-bit
1189
///    integer vectors for equality.
1190
///
1191
/// \headerfile <x86intrin.h>
1192
///
1193
/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1194
///
1195
/// \param __V1
1196
///    A 128-bit integer vector.
1197
/// \param __V2
1198
///    A 128-bit integer vector.
1199
/// \returns A 128-bit integer vector containing the comparison results.
1200
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
                                                             __m128i __V2) {
  /* Element-wise == on [2 x i64]; the per-element result vector is returned
     as __m128i. */
  __v2di __lhs = (__v2di)__V1;
  __v2di __rhs = (__v2di)__V2;
  return (__m128i)(__lhs == __rhs);
}
1204
 
1205
/* SSE4 Packed Integer Sign-Extension.  */
1206
/// Sign-extends each of the lower eight 8-bit integer elements of a
1207
///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
1208
///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
1209
///    are unused.
1210
///
1211
/// \headerfile <x86intrin.h>
1212
///
1213
/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1214
///
1215
/// \param __V
1216
///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1217
///    sign-extended to 16-bit values.
1218
/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1219
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
  /* The extension must always be signed. __v16qi is built on plain char,
     whose signedness varies, so the input is viewed as __v16qs instead. */
  __v16qs __bytes = (__v16qs)__V;
  /* Keep elements 0-7, then widen each one to 16 bits. */
  return (__m128i)__builtin_convertvector(
      __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3, 4, 5, 6, 7),
      __v8hi);
}
1227
 
1228
/// Sign-extends each of the lower four 8-bit integer elements of a
1229
///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
1230
///    128-bit vector of [4 x i32]. The upper twelve elements of the input
1231
///    vector are unused.
1232
///
1233
/// \headerfile <x86intrin.h>
1234
///
1235
/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1236
///
1237
/// \param __V
1238
///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1239
///    sign-extended to 32-bit values.
1240
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1241
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
  /* The extension must always be signed. __v16qi is built on plain char,
     whose signedness varies, so the input is viewed as __v16qs instead. */
  __v16qs __bytes = (__v16qs)__V;
  /* Keep elements 0-3, then widen each one to 32 bits. */
  return (__m128i)__builtin_convertvector(
      __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3), __v4si);
}
1247
 
1248
/// Sign-extends each of the lower two 8-bit integer elements of a
1249
///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1250
///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1251
///    vector are unused.
1252
///
1253
/// \headerfile <x86intrin.h>
1254
///
1255
/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1256
///
1257
/// \param __V
1258
///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1259
///    sign-extended to 64-bit values.
1260
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1261
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
  /* The extension must always be signed. __v16qi is built on plain char,
     whose signedness varies, so the input is viewed as __v16qs instead. */
  __v16qs __bytes = (__v16qs)__V;
  /* Keep elements 0-1, then widen each one to 64 bits. */
  return (__m128i)__builtin_convertvector(
      __builtin_shufflevector(__bytes, __bytes, 0, 1), __v2di);
}
1267
 
1268
/// Sign-extends each of the lower four 16-bit integer elements of a
1269
///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1270
///    a 128-bit vector of [4 x i32]. The upper four elements of the input
1271
///    vector are unused.
1272
///
1273
/// \headerfile <x86intrin.h>
1274
///
1275
/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1276
///
1277
/// \param __V
1278
///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1279
///    sign-extended to 32-bit values.
1280
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1281
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
  /* Keep elements 0-3 of the [8 x i16] view, then widen each to 32 bits. */
  __v8hi __words = (__v8hi)__V;
  return (__m128i)__builtin_convertvector(
      __builtin_shufflevector(__words, __words, 0, 1, 2, 3), __v4si);
}
1285
 
1286
/// Sign-extends each of the lower two 16-bit integer elements of a
1287
///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1288
///    a 128-bit vector of [2 x i64]. The upper six elements of the input
1289
///    vector are unused.
1290
///
1291
/// \headerfile <x86intrin.h>
1292
///
1293
/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1294
///
1295
/// \param __V
1296
///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1297
///    sign-extended to 64-bit values.
1298
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1299
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
  /* Keep elements 0-1 of the [8 x i16] view, then widen each to 64 bits. */
  __v8hi __words = (__v8hi)__V;
  return (__m128i)__builtin_convertvector(
      __builtin_shufflevector(__words, __words, 0, 1), __v2di);
}
1303
 
1304
/// Sign-extends each of the lower two 32-bit integer elements of a
1305
///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1306
///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1307
///    are unused.
1308
///
1309
/// \headerfile <x86intrin.h>
1310
///
1311
/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1312
///
1313
/// \param __V
1314
///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1315
///    sign-extended to 64-bit values.
1316
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1317
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
  /* Keep elements 0-1 of the [4 x i32] view, then widen each to 64 bits. */
  __v4si __dwords = (__v4si)__V;
  return (__m128i)__builtin_convertvector(
      __builtin_shufflevector(__dwords, __dwords, 0, 1), __v2di);
}
1321
 
1322
/* SSE4 Packed Integer Zero-Extension.  */
1323
/// Zero-extends each of the lower eight 8-bit integer elements of a
1324
///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
1325
///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
1326
///    are unused.
1327
///
1328
/// \headerfile <x86intrin.h>
1329
///
1330
/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1331
///
1332
/// \param __V
1333
///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1334
///    zero-extended to 16-bit values.
1335
/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1336
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
  /* The input is viewed through the unsigned byte vector type __v16qu;
     elements 0-7 are kept and each is widened to 16 bits. */
  __v16qu __bytes = (__v16qu)__V;
  return (__m128i)__builtin_convertvector(
      __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3, 4, 5, 6, 7),
      __v8hi);
}
1342
 
1343
/// Zero-extends each of the lower four 8-bit integer elements of a
1344
///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
1345
///    128-bit vector of [4 x i32]. The upper twelve elements of the input
1346
///    vector are unused.
1347
///
1348
/// \headerfile <x86intrin.h>
1349
///
1350
/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1351
///
1352
/// \param __V
1353
///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1354
///    zero-extended to 32-bit values.
1355
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1356
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
  /* The input is viewed through the unsigned byte vector type __v16qu;
     elements 0-3 are kept and each is widened to 32 bits. */
  __v16qu __bytes = (__v16qu)__V;
  return (__m128i)__builtin_convertvector(
      __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3), __v4si);
}
1360
 
1361
/// Zero-extends each of the lower two 8-bit integer elements of a
1362
///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1363
///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1364
///    vector are unused.
1365
///
1366
/// \headerfile <x86intrin.h>
1367
///
1368
/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1369
///
1370
/// \param __V
1371
///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1372
///    zero-extended to 64-bit values.
1373
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1374
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
  /* Keep lanes 0-1 of the byte view of __V and widen each lane to 64 bits.
     The unsigned vector type (__v16qu) makes the widening a zero-extension. */
  return (__m128i) __builtin_convertvector(
      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
}
1378
 
1379
/// Zero-extends each of the lower four 16-bit integer elements of a
1380
///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1381
///    a 128-bit vector of [4 x i32]. The upper four elements of the input
1382
///    vector are unused.
1383
///
1384
/// \headerfile <x86intrin.h>
1385
///
1386
/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1387
///
1388
/// \param __V
1389
///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1390
///    zero-extended to 32-bit values.
1391
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1392
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
  /* Keep lanes 0-3 of the 16-bit view of __V and widen each lane to 32 bits.
     The unsigned vector type (__v8hu) makes the widening a zero-extension. */
  return (__m128i) __builtin_convertvector(
      __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
}
1396
 
1397
/// Zero-extends each of the lower two 16-bit integer elements of a
1398
///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1399
///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1400
///    are unused.
1401
///
1402
/// \headerfile <x86intrin.h>
1403
///
1404
/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1405
///
1406
/// \param __V
1407
///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1408
///    zero-extended to 64-bit values.
1409
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1410
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
  /* Keep lanes 0-1 of the 16-bit view of __V and widen each lane to 64 bits.
     The unsigned vector type (__v8hu) makes the widening a zero-extension. */
  return (__m128i) __builtin_convertvector(
      __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
}
1414
 
1415
/// Zero-extends each of the lower two 32-bit integer elements of a
1416
///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1417
///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1418
///    are unused.
1419
///
1420
/// \headerfile <x86intrin.h>
1421
///
1422
/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1423
///
1424
/// \param __V
1425
///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1426
///    zero-extended to 64-bit values.
1427
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1428
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
  /* Keep lanes 0-1 of the 32-bit view of __V and widen each lane to 64 bits.
     The unsigned vector type (__v4su) makes the widening a zero-extension. */
  return (__m128i) __builtin_convertvector(
      __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
}
1432
 
1433
/* SSE4 Pack with Unsigned Saturation.  */
1434
/// Converts 32-bit signed integers from both 128-bit integer vector
1435
///    operands into 16-bit unsigned integers, and returns the packed result.
1436
///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1437
///    0x0000 are saturated to 0x0000.
1438
///
1439
/// \headerfile <x86intrin.h>
1440
///
1441
/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1442
///
1443
/// \param __V1
1444
///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1445
///    signed integer and is converted to a 16-bit unsigned integer with
1446
///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1447
///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1448
///    are written to the lower 64 bits of the result.
1449
/// \param __V2
1450
///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
1451
///    signed integer and is converted to a 16-bit unsigned integer with
1452
///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
1453
///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
1454
///    are written to the higher 64 bits of the result.
1455
/// \returns A 128-bit vector of [8 x i16] containing the converted values.
1456
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
                                                              __m128i __V2) {
  /* Both operands are handed to the builtin as [4 x i32]; the saturating
     narrowing to 16 bits is performed by the builtin itself. */
  return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
}
1460
 
1461
/* SSE4 Multiple Packed Sums of Absolute Difference.  */
1462
/// Subtracts 8-bit unsigned integer values and computes the absolute
1463
///    values of the differences to the corresponding bits in the destination.
1464
///    Then sums of the absolute differences are returned according to the bit
1465
///    fields in the immediate operand.
1466
///
1467
/// \headerfile <x86intrin.h>
1468
///
1469
/// \code
1470
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
1471
/// \endcode
1472
///
1473
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
1474
///
1475
/// \param X
1476
///    A 128-bit vector of [16 x i8].
1477
/// \param Y
1478
///    A 128-bit vector of [16 x i8].
1479
/// \param M
1480
///    An 8-bit immediate operand specifying how the absolute differences are to
1481
///    be calculated, according to the following algorithm:
1482
///    \code
1483
///    // M2 represents bit 2 of the immediate operand
1484
///    // M10 represents bits [1:0] of the immediate operand
1485
///    i = M2 * 4;
1486
///    j = M10 * 4;
1487
///    for (k = 0; k < 8; k = k + 1) {
1488
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
1489
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
1490
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
1491
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
1492
///      r[k] = d0 + d1 + d2 + d3;
1493
///    }
1494
///    \endcode
1495
/// \returns A 128-bit integer vector containing the sums of the sets of
1496
///    absolute differences between both operands.
1497
/* X and Y are cast through __m128i and passed to the builtin as byte vectors;
   M (documented as an immediate operand) is forwarded unchanged. */
#define _mm_mpsadbw_epu8(X, Y, M)                                              \
  ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X),                   \
                                      (__v16qi)(__m128i)(Y), (M)))
1500
 
1501
/// Finds the minimum unsigned 16-bit element in the input 128-bit
1502
///    vector of [8 x u16] and returns it along with its index.
1503
///
1504
/// \headerfile <x86intrin.h>
1505
///
1506
/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1507
/// instruction.
1508
///
1509
/// \param __V
1510
///    A 128-bit vector of [8 x u16].
1511
/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1512
///    in parameter \a __V, bits [18:16] contain the index of the minimum value
1513
///    and the remaining bits are set to 0.
1514
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
  /* The operand is handed to the builtin through the [8 x i16] vector type;
     locating the minimum and packing it with its index is done by the
     builtin. */
  return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
}
1517
 
1518
/* Handle the sse4.2 definitions here. */
1519
 
1520
/* These definitions are normally in nmmintrin.h, but gcc puts them in here
1521
   so we'll do the same.  */
1522
 
1523
/* Replace the default function attributes so that definitions using
   __DEFAULT_FN_ATTRS from this point on carry the "sse4.2" target feature. */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
1526
 
1527
/* These specify the type of data that we're comparing (immediate bits
   [1:0]).  */
#define _SIDD_UBYTE_OPS 0x00
#define _SIDD_UWORD_OPS 0x01
#define _SIDD_SBYTE_OPS 0x02
#define _SIDD_SWORD_OPS 0x03

/* These specify the type of comparison operation (immediate bits [3:2]).  */
#define _SIDD_CMP_EQUAL_ANY 0x00
#define _SIDD_CMP_RANGES 0x04
#define _SIDD_CMP_EQUAL_EACH 0x08
#define _SIDD_CMP_EQUAL_ORDERED 0x0c

/* These macros specify the polarity of the operation (immediate bits
   [5:4]).  */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30

/* These macros are used in _mm_cmpXstri() to specify which set bit's index
   is returned (immediate bit 6).  */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40

/* These macros are used in _mm_cmpXstrm() to specify the format of the
   returned mask (immediate bit 6).  */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
1552
 
1553
/* SSE4.2 Packed Comparison Intrinsics.  */
1554
/// Uses the immediate operand \a M to perform a comparison of string
1555
///    data with implicitly defined lengths that is contained in source operands
1556
///    \a A and \a B. Returns a 128-bit integer vector representing the result
1557
///    mask of the comparison.
1558
///
1559
/// \headerfile <x86intrin.h>
1560
///
1561
/// \code
1562
/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
1563
/// \endcode
1564
///
1565
/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
1566
/// instruction.
1567
///
1568
/// \param A
1569
///    A 128-bit integer vector containing one of the source operands to be
1570
///    compared.
1571
/// \param B
1572
///    A 128-bit integer vector containing one of the source operands to be
1573
///    compared.
1574
/// \param M
1575
///    An 8-bit immediate operand specifying whether the characters are bytes or
1576
///    words, the type of comparison to perform, and the format of the return
1577
///    value. \n
1578
///    Bits [1:0]: Determine source data format. \n
1579
///      00: 16 unsigned bytes \n
1580
///      01: 8 unsigned words \n
1581
///      10: 16 signed bytes \n
1582
///      11: 8 signed words \n
1583
///    Bits [3:2]: Determine comparison type and aggregation method. \n
1584
///      00: Subset: Each character in \a B is compared for equality with all
1585
///          the characters in \a A. \n
1586
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
1587
///          basis is greater than or equal for even-indexed elements in \a A,
1588
///          and less than or equal for odd-indexed elements in \a A. \n
1589
///      10: Match: Compare each pair of corresponding characters in \a A and
1590
///          \a B for equality. \n
1591
///      11: Substring: Search \a B for substring matches of \a A. \n
1592
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
1593
///                mask of the comparison results. \n
1594
///      00: No effect. \n
1595
///      01: Negate the bit mask. \n
1596
///      10: No effect. \n
1597
///      11: Negate the bit mask only for bits with an index less than or equal
1598
///          to the size of \a A or \a B. \n
1599
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
1600
///             bytes. \n
1601
///      0: The result is zero-extended to 16 bytes. \n
1602
///      1: The result is expanded to 16 bytes (this expansion is performed by
1603
///         repeating each bit 8 or 16 times).
1604
/// \returns Returns a 128-bit integer vector representing the result mask of
1605
///    the comparison.
1606
/* A and B are cast through __m128i and passed as byte vectors; M is cast to
   int. The builtin's result is returned as __m128i. */
#define _mm_cmpistrm(A, B, M)                                                  \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A),                 \
                                        (__v16qi)(__m128i)(B), (int)(M)))
1609
 
1610
/// Uses the immediate operand \a M to perform a comparison of string
1611
///    data with implicitly defined lengths that is contained in source operands
1612
///    \a A and \a B. Returns an integer representing the result index of the
1613
///    comparison.
1614
///
1615
/// \headerfile <x86intrin.h>
1616
///
1617
/// \code
1618
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
1619
/// \endcode
1620
///
1621
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1622
/// instruction.
1623
///
1624
/// \param A
1625
///    A 128-bit integer vector containing one of the source operands to be
1626
///    compared.
1627
/// \param B
1628
///    A 128-bit integer vector containing one of the source operands to be
1629
///    compared.
1630
/// \param M
1631
///    An 8-bit immediate operand specifying whether the characters are bytes or
1632
///    words, the type of comparison to perform, and the format of the return
1633
///    value. \n
1634
///    Bits [1:0]: Determine source data format. \n
1635
///      00: 16 unsigned bytes \n
1636
///      01: 8 unsigned words \n
1637
///      10: 16 signed bytes \n
1638
///      11: 8 signed words \n
1639
///    Bits [3:2]: Determine comparison type and aggregation method. \n
1640
///      00: Subset: Each character in \a B is compared for equality with all
1641
///          the characters in \a A. \n
1642
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
1643
///          basis is greater than or equal for even-indexed elements in \a A,
1644
///          and less than or equal for odd-indexed elements in \a A. \n
1645
///      10: Match: Compare each pair of corresponding characters in \a A and
1646
///          \a B for equality. \n
1647
///      11: Substring: Search \a B for substring matches of \a A. \n
1648
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
1649
///                mask of the comparison results. \n
1650
///      00: No effect. \n
1651
///      01: Negate the bit mask. \n
1652
///      10: No effect. \n
1653
///      11: Negate the bit mask only for bits with an index less than or equal
1654
///          to the size of \a A or \a B. \n
1655
///    Bit [6]: Determines whether the index of the lowest set bit or the
1656
///             highest set bit is returned. \n
1657
///      0: The index of the least significant set bit. \n
1658
///      1: The index of the most significant set bit. \n
1659
/// \returns Returns an integer representing the result index of the comparison.
1660
/* A and B are cast through __m128i and passed as byte vectors; M is cast to
   int. The builtin's result is returned as int. */
#define _mm_cmpistri(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A),                     \
                                    (__v16qi)(__m128i)(B), (int)(M)))
1663
 
1664
/// Uses the immediate operand \a M to perform a comparison of string
1665
///    data with explicitly defined lengths that is contained in source operands
1666
///    \a A and \a B. Returns a 128-bit integer vector representing the result
1667
///    mask of the comparison.
1668
///
1669
/// \headerfile <x86intrin.h>
1670
///
1671
/// \code
1672
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
1673
/// \endcode
1674
///
1675
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
1676
/// instruction.
1677
///
1678
/// \param A
1679
///    A 128-bit integer vector containing one of the source operands to be
1680
///    compared.
1681
/// \param LA
1682
///    An integer that specifies the length of the string in \a A.
1683
/// \param B
1684
///    A 128-bit integer vector containing one of the source operands to be
1685
///    compared.
1686
/// \param LB
1687
///    An integer that specifies the length of the string in \a B.
1688
/// \param M
1689
///    An 8-bit immediate operand specifying whether the characters are bytes or
1690
///    words, the type of comparison to perform, and the format of the return
1691
///    value. \n
1692
///    Bits [1:0]: Determine source data format. \n
1693
///      00: 16 unsigned bytes \n
1694
///      01: 8 unsigned words \n
1695
///      10: 16 signed bytes \n
1696
///      11: 8 signed words \n
1697
///    Bits [3:2]: Determine comparison type and aggregation method. \n
1698
///      00: Subset: Each character in \a B is compared for equality with all
1699
///          the characters in \a A. \n
1700
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
1701
///          basis is greater than or equal for even-indexed elements in \a A,
1702
///          and less than or equal for odd-indexed elements in \a A. \n
1703
///      10: Match: Compare each pair of corresponding characters in \a A and
1704
///          \a B for equality. \n
1705
///      11: Substring: Search \a B for substring matches of \a A. \n
1706
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
1707
///                mask of the comparison results. \n
1708
///      00: No effect. \n
1709
///      01: Negate the bit mask. \n
1710
///      10: No effect. \n
1711
///      11: Negate the bit mask only for bits with an index less than or equal
1712
///          to the size of \a A or \a B. \n
1713
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
1714
///             bytes. \n
1715
///      0: The result is zero-extended to 16 bytes. \n
1716
///      1: The result is expanded to 16 bytes (this expansion is performed by
1717
///         repeating each bit 8 or 16 times). \n
1718
/// \returns Returns a 128-bit integer vector representing the result mask of
1719
///    the comparison.
1720
/* A and B are cast through __m128i and passed as byte vectors; the explicit
   lengths LA and LB and the immediate M are each cast to int. The builtin's
   result is returned as __m128i. */
#define _mm_cmpestrm(A, LA, B, LB, M)                                          \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA),      \
                                        (__v16qi)(__m128i)(B), (int)(LB),      \
                                        (int)(M)))
1724
 
1725
/// Uses the immediate operand \a M to perform a comparison of string
1726
///    data with explicitly defined lengths that is contained in source operands
1727
///    \a A and \a B. Returns an integer representing the result index of the
1728
///    comparison.
1729
///
1730
/// \headerfile <x86intrin.h>
1731
///
1732
/// \code
1733
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
1734
/// \endcode
1735
///
1736
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
1737
/// instruction.
1738
///
1739
/// \param A
1740
///    A 128-bit integer vector containing one of the source operands to be
1741
///    compared.
1742
/// \param LA
1743
///    An integer that specifies the length of the string in \a A.
1744
/// \param B
1745
///    A 128-bit integer vector containing one of the source operands to be
1746
///    compared.
1747
/// \param LB
1748
///    An integer that specifies the length of the string in \a B.
1749
/// \param M
1750
///    An 8-bit immediate operand specifying whether the characters are bytes or
1751
///    words, the type of comparison to perform, and the format of the return
1752
///    value. \n
1753
///    Bits [1:0]: Determine source data format. \n
1754
///      00: 16 unsigned bytes \n
1755
///      01: 8 unsigned words \n
1756
///      10: 16 signed bytes \n
1757
///      11: 8 signed words \n
1758
///    Bits [3:2]: Determine comparison type and aggregation method. \n
1759
///      00: Subset: Each character in \a B is compared for equality with all
1760
///          the characters in \a A. \n
1761
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
1762
///          basis is greater than or equal for even-indexed elements in \a A,
1763
///          and less than or equal for odd-indexed elements in \a A. \n
1764
///      10: Match: Compare each pair of corresponding characters in \a A and
1765
///          \a B for equality. \n
1766
///      11: Substring: Search \a B for substring matches of \a A. \n
1767
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
1768
///                mask of the comparison results. \n
1769
///      00: No effect. \n
1770
///      01: Negate the bit mask. \n
1771
///      10: No effect. \n
1772
///      11: Negate the bit mask only for bits with an index less than or equal
1773
///          to the size of \a A or \a B. \n
1774
///    Bit [6]: Determines whether the index of the lowest set bit or the
1775
///             highest set bit is returned. \n
1776
///      0: The index of the least significant set bit. \n
1777
///      1: The index of the most significant set bit. \n
1778
/// \returns Returns an integer representing the result index of the comparison.
1779
/* A and B are cast through __m128i and passed as byte vectors; the explicit
   lengths LA and LB and the immediate M are each cast to int. The builtin's
   result is returned as int. */
#define _mm_cmpestri(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA),          \
                                    (__v16qi)(__m128i)(B), (int)(LB),          \
                                    (int)(M)))
1783
 
1784
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
1785
/// Uses the immediate operand \a M to perform a comparison of string
1786
///    data with implicitly defined lengths that is contained in source operands
1787
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
1788
///    string in \a B is the maximum, otherwise, returns 0.
1789
///
1790
/// \headerfile <x86intrin.h>
1791
///
1792
/// \code
1793
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
1794
/// \endcode
1795
///
1796
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1797
/// instruction.
1798
///
1799
/// \param A
1800
///    A 128-bit integer vector containing one of the source operands to be
1801
///    compared.
1802
/// \param B
1803
///    A 128-bit integer vector containing one of the source operands to be
1804
///    compared.
1805
/// \param M
1806
///    An 8-bit immediate operand specifying whether the characters are bytes or
1807
///    words and the type of comparison to perform. \n
1808
///    Bits [1:0]: Determine source data format. \n
1809
///      00: 16 unsigned bytes \n
1810
///      01: 8 unsigned words \n
1811
///      10: 16 signed bytes \n
1812
///      11: 8 signed words \n
1813
///    Bits [3:2]: Determine comparison type and aggregation method. \n
1814
///      00: Subset: Each character in \a B is compared for equality with all
1815
///          the characters in \a A. \n
1816
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
1817
///          basis is greater than or equal for even-indexed elements in \a A,
1818
///          and less than or equal for odd-indexed elements in \a A. \n
1819
///      10: Match: Compare each pair of corresponding characters in \a A and
1820
///          \a B for equality. \n
1821
///      11: Substring: Search \a B for substring matches of \a A. \n
1822
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
1823
///                mask of the comparison results. \n
1824
///      00: No effect. \n
1825
///      01: Negate the bit mask. \n
1826
///      10: No effect. \n
1827
///      11: Negate the bit mask only for bits with an index less than or equal
1828
///          to the size of \a A or \a B. \n
1829
/// \returns Returns 1 if the bit mask is zero and the length of the string in
1830
///    \a B is the maximum; otherwise, returns 0.
1831
/* A and B are cast through __m128i and passed as byte vectors; M is cast to
   int. The builtin's result is returned as int. */
#define _mm_cmpistra(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1834
 
1835
/// Uses the immediate operand \a M to perform a comparison of string
1836
///    data with implicitly defined lengths that is contained in source operands
1837
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
1838
///    0.
1839
///
1840
/// \headerfile <x86intrin.h>
1841
///
1842
/// \code
1843
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
1844
/// \endcode
1845
///
1846
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1847
/// instruction.
1848
///
1849
/// \param A
1850
///    A 128-bit integer vector containing one of the source operands to be
1851
///    compared.
1852
/// \param B
1853
///    A 128-bit integer vector containing one of the source operands to be
1854
///    compared.
1855
/// \param M
1856
///    An 8-bit immediate operand specifying whether the characters are bytes or
1857
///    words and the type of comparison to perform. \n
1858
///    Bits [1:0]: Determine source data format. \n
1859
///      00: 16 unsigned bytes \n
1860
///      01: 8 unsigned words \n
1861
///      10: 16 signed bytes \n
1862
///      11: 8 signed words \n
1863
///    Bits [3:2]: Determine comparison type and aggregation method. \n
1864
///      00: Subset: Each character in \a B is compared for equality with all
1865
///          the characters in \a A. \n
1866
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
1867
///          basis is greater than or equal for even-indexed elements in \a A,
1868
///          and less than or equal for odd-indexed elements in \a A. \n
1869
///      10: Match: Compare each pair of corresponding characters in \a A and
1870
///          \a B for equality. \n
1871
///      11: Substring: Search \a B for substring matches of \a A. \n
1872
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
1873
///                mask of the comparison results. \n
1874
///      00: No effect. \n
1875
///      01: Negate the bit mask. \n
1876
///      10: No effect. \n
1877
///      11: Negate the bit mask only for bits with an index less than or equal
1878
///          to the size of \a A or \a B.
1879
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
1880
/* A and B are cast through __m128i and passed as byte vectors; M is cast to
   int. The builtin's result is returned as int. */
#define _mm_cmpistrc(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1883
 
1884
/// Uses the immediate operand \a M to perform a comparison of string
1885
///    data with implicitly defined lengths that is contained in source operands
1886
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
1887
///
1888
/// \headerfile <x86intrin.h>
1889
///
1890
/// \code
1891
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
1892
/// \endcode
1893
///
1894
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1895
/// instruction.
1896
///
1897
/// \param A
1898
///    A 128-bit integer vector containing one of the source operands to be
1899
///    compared.
1900
/// \param B
1901
///    A 128-bit integer vector containing one of the source operands to be
1902
///    compared.
1903
/// \param M
1904
///    An 8-bit immediate operand specifying whether the characters are bytes or
1905
///    words and the type of comparison to perform. \n
1906
///    Bits [1:0]: Determine source data format. \n
1907
///      00: 16 unsigned bytes \n
1908
///      01: 8 unsigned words \n
1909
///      10: 16 signed bytes \n
1910
///      11: 8 signed words \n
1911
///    Bits [3:2]: Determine comparison type and aggregation method. \n
1912
///      00: Subset: Each character in \a B is compared for equality with all
1913
///          the characters in \a A. \n
1914
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
1915
///          basis is greater than or equal for even-indexed elements in \a A,
1916
///          and less than or equal for odd-indexed elements in \a A. \n
1917
///      10: Match: Compare each pair of corresponding characters in \a A and
1918
///          \a B for equality. \n
1919
///      11: Substring: Search \a B for substring matches of \a A. \n
1920
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
1921
///                mask of the comparison results. \n
1922
///      00: No effect. \n
1923
///      01: Negate the bit mask. \n
1924
///      10: No effect. \n
1925
///      11: Negate the bit mask only for bits with an index less than or equal
1926
///          to the size of \a A or \a B. \n
1927
/// \returns Returns bit 0 of the resulting bit mask.
1928
/* A and B are cast through __m128i and passed as byte vectors; M is cast to
   int. The builtin's result is returned as int. */
#define _mm_cmpistro(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1931
 
1932
/// Uses the immediate operand \a M to perform a comparison of string
1933
///    data with implicitly defined lengths that is contained in source operands
1934
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
1935
///    the maximum, otherwise, returns 0.
1936
///
1937
/// \headerfile <x86intrin.h>
1938
///
1939
/// \code
1940
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
1941
/// \endcode
1942
///
1943
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
1944
/// instruction.
1945
///
1946
/// \param A
1947
///    A 128-bit integer vector containing one of the source operands to be
1948
///    compared.
1949
/// \param B
1950
///    A 128-bit integer vector containing one of the source operands to be
1951
///    compared.
1952
/// \param M
1953
///    An 8-bit immediate operand specifying whether the characters are bytes or
1954
///    words and the type of comparison to perform. \n
1955
///    Bits [1:0]: Determine source data format. \n
1956
///      00: 16 unsigned bytes \n
1957
///      01: 8 unsigned words \n
1958
///      10: 16 signed bytes \n
1959
///      11: 8 signed words \n
1960
///    Bits [3:2]: Determine comparison type and aggregation method. \n
1961
///      00: Subset: Each character in \a B is compared for equality with all
1962
///          the characters in \a A. \n
1963
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
1964
///          basis is greater than or equal for even-indexed elements in \a A,
1965
///          and less than or equal for odd-indexed elements in \a A. \n
1966
///      10: Match: Compare each pair of corresponding characters in \a A and
1967
///          \a B for equality. \n
1968
///      11: Substring: Search \a B for substring matches of \a A. \n
1969
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
1970
///                mask of the comparison results. \n
1971
///      00: No effect. \n
1972
///      01: Negate the bit mask. \n
1973
///      10: No effect. \n
1974
///      11: Negate the bit mask only for bits with an index less than or equal
1975
///          to the size of \a A or \a B. \n
1976
/// \returns Returns 1 if the length of the string in \a A is less than the
1977
///    maximum, otherwise, returns 0.
1978
/* A and B are cast through __m128i and passed as byte vectors; M is cast to
   int. The builtin's result is returned as int. */
#define _mm_cmpistrs(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A),                    \
                                     (__v16qi)(__m128i)(B), (int)(M)))
1981
 
1982
/// Compares string data with implicitly defined lengths held in source
///    operands \a A and \a B, using the comparison selected by the immediate
///    operand \a M. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param B
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param M
///    An 8-bit immediate that selects the character width (bytes or words)
///    and the kind of comparison to carry out. \n
///    Bits [1:0]: Source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Comparison type and aggregation method. \n
///      00: Subset: Every character of \a B is tested for equality against
///          all of the characters of \a A. \n
///      01: Ranges: Every character of \a B is compared with \a A. The test
///          is greater than or equal for even-indexed elements of \a A and
///          less than or equal for odd-indexed elements of \a A. \n
///      10: Match: Corresponding characters of \a A and \a B are compared
///          pairwise for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Whether a one's complement is applied to the bit mask of
///                the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate only those bits of the mask whose index is less than or
///          equal to the size of \a A or \a B.
/// \returns 1 if the length of the string in \a B is less than the maximum;
///    otherwise, 0.
#define _mm_cmpistrz(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistriz128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
2031
 
2032
/// Compares string data with explicitly defined lengths held in source
///    operands \a A and \a B, using the comparison selected by the immediate
///    operand \a M. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LA
///    An integer giving the length of the string in \a A.
/// \param B
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LB
///    An integer giving the length of the string in \a B.
/// \param M
///    An 8-bit immediate that selects the character width (bytes or words)
///    and the kind of comparison to carry out. \n
///    Bits [1:0]: Source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Comparison type and aggregation method. \n
///      00: Subset: Every character of \a B is tested for equality against
///          all of the characters of \a A. \n
///      01: Ranges: Every character of \a B is compared with \a A. The test
///          is greater than or equal for even-indexed elements of \a A and
///          less than or equal for odd-indexed elements of \a A. \n
///      10: Match: Corresponding characters of \a A and \a B are compared
///          pairwise for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Whether a one's complement is applied to the bit mask of
///                the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate only those bits of the mask whose index is less than or
///          equal to the size of \a A or \a B.
/// \returns 1 if the bit mask is zero and the length of the string in \a B is
///    the maximum; otherwise, 0.
#define _mm_cmpestra(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestria128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
2086
 
2087
/// Compares string data with explicitly defined lengths held in source
///    operands \a A and \a B, using the comparison selected by the immediate
///    operand \a M. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LA
///    An integer giving the length of the string in \a A.
/// \param B
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LB
///    An integer giving the length of the string in \a B.
/// \param M
///    An 8-bit immediate that selects the character width (bytes or words)
///    and the kind of comparison to carry out. \n
///    Bits [1:0]: Source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Comparison type and aggregation method. \n
///      00: Subset: Every character of \a B is tested for equality against
///          all of the characters of \a A. \n
///      01: Ranges: Every character of \a B is compared with \a A. The test
///          is greater than or equal for even-indexed elements of \a A and
///          less than or equal for odd-indexed elements of \a A. \n
///      10: Match: Corresponding characters of \a A and \a B are compared
///          pairwise for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Whether a one's complement is applied to the bit mask of
///                the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate only those bits of the mask whose index is less than or
///          equal to the size of \a A or \a B. \n
/// \returns 1 if the resulting mask is non-zero; otherwise, 0.
#define _mm_cmpestrc(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestric128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
2140
 
2141
/// Compares string data with explicitly defined lengths held in source
///    operands \a A and \a B, using the comparison selected by the immediate
///    operand \a M. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LA
///    An integer giving the length of the string in \a A.
/// \param B
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LB
///    An integer giving the length of the string in \a B.
/// \param M
///    An 8-bit immediate that selects the character width (bytes or words)
///    and the kind of comparison to carry out. \n
///    Bits [1:0]: Source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Comparison type and aggregation method. \n
///      00: Subset: Every character of \a B is tested for equality against
///          all of the characters of \a A. \n
///      01: Ranges: Every character of \a B is compared with \a A. The test
///          is greater than or equal for even-indexed elements of \a A and
///          less than or equal for odd-indexed elements of \a A. \n
///      10: Match: Corresponding characters of \a A and \a B are compared
///          pairwise for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Whether a one's complement is applied to the bit mask of
///                the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate only those bits of the mask whose index is less than or
///          equal to the size of \a A or \a B.
/// \returns Bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestrio128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
2193
 
2194
/// Compares string data with explicitly defined lengths held in source
///    operands \a A and \a B, using the comparison selected by the immediate
///    operand \a M. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LA
///    An integer giving the length of the string in \a A.
/// \param B
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LB
///    An integer giving the length of the string in \a B.
/// \param M
///    An 8-bit immediate that selects the character width (bytes or words)
///    and the kind of comparison to carry out. \n
///    Bits [1:0]: Source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Comparison type and aggregation method. \n
///      00: Subset: Every character of \a B is tested for equality against
///          all of the characters of \a A. \n
///      01: Ranges: Every character of \a B is compared with \a A. The test
///          is greater than or equal for even-indexed elements of \a A and
///          less than or equal for odd-indexed elements of \a A. \n
///      10: Match: Corresponding characters of \a A and \a B are compared
///          pairwise for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Whether a one's complement is applied to the bit mask of
///                the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate only those bits of the mask whose index is less than or
///          equal to the size of \a A or \a B. \n
/// \returns 1 if the length of the string in \a A is less than the maximum;
///    otherwise, 0.
#define _mm_cmpestrs(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestris128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
2248
 
2249
/// Compares string data with explicitly defined lengths held in source
///    operands \a A and \a B, using the comparison selected by the immediate
///    operand \a M. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LA
///    An integer giving the length of the string in \a A.
/// \param B
///    A 128-bit integer vector holding one of the source operands to
///    compare.
/// \param LB
///    An integer giving the length of the string in \a B.
/// \param M
///    An 8-bit immediate that selects the character width (bytes or words)
///    and the kind of comparison to carry out. \n
///    Bits [1:0]: Source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Comparison type and aggregation method. \n
///      00: Subset: Every character of \a B is tested for equality against
///          all of the characters of \a A. \n
///      01: Ranges: Every character of \a B is compared with \a A. The test
///          is greater than or equal for even-indexed elements of \a A and
///          less than or equal for odd-indexed elements of \a A. \n
///      10: Match: Corresponding characters of \a A and \a B are compared
///          pairwise for equality. \n
///      11: Substring: \a B is searched for substring matches of \a A. \n
///    Bits [5:4]: Whether a one's complement is applied to the bit mask of
///                the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate only those bits of the mask whose index is less than or
///          equal to the size of \a A or \a B.
/// \returns 1 if the length of the string in \a B is less than the maximum;
///    otherwise, 0.
#define _mm_cmpestrz(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestriz128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
2302
 
2303
/* SSE4.2 Compare Packed Data -- Greater Than.  */
2304
/// Compares each of the corresponding 64-bit values of the 128-bit
2305
///    integer vectors to determine if the values in the first operand are
2306
///    greater than those in the second operand.
2307
///
2308
/// \headerfile <x86intrin.h>
2309
///
2310
/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2311
///
2312
/// \param __V1
2313
///    A 128-bit integer vector.
2314
/// \param __V2
2315
///    A 128-bit integer vector.
2316
/// \returns A 128-bit integer vector containing the comparison results.
2317
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
2318
                                                             __m128i __V2) {
2319
  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2320
}
2321
 
2322
#undef __DEFAULT_FN_ATTRS
2323
 
2324
#include <popcntintrin.h>
2325
 
2326
#include <crc32intrin.h>
2327
 
2328
#endif /* __SMMINTRIN_H */