Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __EMMINTRIN_H
  11. #define __EMMINTRIN_H
  12.  
  13. #if !defined(__i386__) && !defined(__x86_64__)
  14. #error "This header is only meant to be used on x86 and x64 architecture"
  15. #endif
  16.  
  17. #include <xmmintrin.h>
  18.  
/* Public 128-bit vector types. __m128d is declared with double elements and
 * __m128i with long long elements; both are 16 bytes wide and 16-byte
 * aligned. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* "_u" variants of the types above: same size and element type, but declared
 * with 1-byte alignment instead of 16-byte alignment. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines.  */
/* 16-byte vectors with an explicit signed element type: double, long long,
 * short and (plain) char. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
/* 16-byte vectors of _Float16: two 16-byte-aligned types (__v8hf, __m128h)
 * and a 1-byte-aligned one (__m128h_u). */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

/* 16-byte, 16-byte-aligned vectors of __bf16. */
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif
  50.  
/* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS: always-inline, no debug info, compiled for the "sse2"
 * target feature, with a minimum vector width of 128 bits. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
/* __DEFAULT_FN_ATTRS_MMX: same as above, but compiled for the "mmx,sse2"
 * target features and with a minimum vector width of 64 bits. */
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
                 __min_vector_width__(64)))
  58.  
  59. /// Adds lower double-precision values in both operands and returns the
  60. ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
  61. ///    are copied from the upper double-precision value of the first operand.
  62. ///
  63. /// \headerfile <x86intrin.h>
  64. ///
  65. /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
  66. ///
  67. /// \param __a
  68. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  69. /// \param __b
  70. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  71. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  72. ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
  73. ///    from the upper 64 bits of the first source operand.
  74. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
  75.                                                         __m128d __b) {
  76.   __a[0] += __b[0];
  77.   return __a;
  78. }
  79.  
  80. /// Adds two 128-bit vectors of [2 x double].
  81. ///
  82. /// \headerfile <x86intrin.h>
  83. ///
  84. /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
  85. ///
  86. /// \param __a
  87. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  88. /// \param __b
  89. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  90. /// \returns A 128-bit vector of [2 x double] containing the sums of both
  91. ///    operands.
  92. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
  93.                                                         __m128d __b) {
  94.   return (__m128d)((__v2df)__a + (__v2df)__b);
  95. }
  96.  
  97. /// Subtracts the lower double-precision value of the second operand
  98. ///    from the lower double-precision value of the first operand and returns
  99. ///    the difference in the lower 64 bits of the result. The upper 64 bits of
  100. ///    the result are copied from the upper double-precision value of the first
  101. ///    operand.
  102. ///
  103. /// \headerfile <x86intrin.h>
  104. ///
  105. /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
  106. ///
  107. /// \param __a
  108. ///    A 128-bit vector of [2 x double] containing the minuend.
  109. /// \param __b
  110. ///    A 128-bit vector of [2 x double] containing the subtrahend.
  111. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  112. ///    difference of the lower 64 bits of both operands. The upper 64 bits are
  113. ///    copied from the upper 64 bits of the first source operand.
  114. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
  115.                                                         __m128d __b) {
  116.   __a[0] -= __b[0];
  117.   return __a;
  118. }
  119.  
  120. /// Subtracts two 128-bit vectors of [2 x double].
  121. ///
  122. /// \headerfile <x86intrin.h>
  123. ///
  124. /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
  125. ///
  126. /// \param __a
  127. ///    A 128-bit vector of [2 x double] containing the minuend.
  128. /// \param __b
  129. ///    A 128-bit vector of [2 x double] containing the subtrahend.
  130. /// \returns A 128-bit vector of [2 x double] containing the differences between
  131. ///    both operands.
  132. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
  133.                                                         __m128d __b) {
  134.   return (__m128d)((__v2df)__a - (__v2df)__b);
  135. }
  136.  
  137. /// Multiplies lower double-precision values in both operands and returns
  138. ///    the product in the lower 64 bits of the result. The upper 64 bits of the
  139. ///    result are copied from the upper double-precision value of the first
  140. ///    operand.
  141. ///
  142. /// \headerfile <x86intrin.h>
  143. ///
  144. /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
  145. ///
  146. /// \param __a
  147. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  148. /// \param __b
  149. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  150. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  151. ///    product of the lower 64 bits of both operands. The upper 64 bits are
  152. ///    copied from the upper 64 bits of the first source operand.
  153. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
  154.                                                         __m128d __b) {
  155.   __a[0] *= __b[0];
  156.   return __a;
  157. }
  158.  
  159. /// Multiplies two 128-bit vectors of [2 x double].
  160. ///
  161. /// \headerfile <x86intrin.h>
  162. ///
  163. /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
  164. ///
  165. /// \param __a
  166. ///    A 128-bit vector of [2 x double] containing one of the operands.
  167. /// \param __b
  168. ///    A 128-bit vector of [2 x double] containing one of the operands.
  169. /// \returns A 128-bit vector of [2 x double] containing the products of both
  170. ///    operands.
  171. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
  172.                                                         __m128d __b) {
  173.   return (__m128d)((__v2df)__a * (__v2df)__b);
  174. }
  175.  
  176. /// Divides the lower double-precision value of the first operand by the
  177. ///    lower double-precision value of the second operand and returns the
  178. ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
  179. ///    result are copied from the upper double-precision value of the first
  180. ///    operand.
  181. ///
  182. /// \headerfile <x86intrin.h>
  183. ///
  184. /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
  185. ///
  186. /// \param __a
  187. ///    A 128-bit vector of [2 x double] containing the dividend.
  188. /// \param __b
  189. ///    A 128-bit vector of [2 x double] containing divisor.
  190. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  191. ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
  192. ///    copied from the upper 64 bits of the first source operand.
  193. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
  194.                                                         __m128d __b) {
  195.   __a[0] /= __b[0];
  196.   return __a;
  197. }
  198.  
  199. /// Performs an element-by-element division of two 128-bit vectors of
  200. ///    [2 x double].
  201. ///
  202. /// \headerfile <x86intrin.h>
  203. ///
  204. /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
  205. ///
  206. /// \param __a
  207. ///    A 128-bit vector of [2 x double] containing the dividend.
  208. /// \param __b
  209. ///    A 128-bit vector of [2 x double] containing the divisor.
  210. /// \returns A 128-bit vector of [2 x double] containing the quotients of both
  211. ///    operands.
  212. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
  213.                                                         __m128d __b) {
  214.   return (__m128d)((__v2df)__a / (__v2df)__b);
  215. }
  216.  
  217. /// Calculates the square root of the lower double-precision value of
  218. ///    the second operand and returns it in the lower 64 bits of the result.
  219. ///    The upper 64 bits of the result are copied from the upper
  220. ///    double-precision value of the first operand.
  221. ///
  222. /// \headerfile <x86intrin.h>
  223. ///
  224. /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
  225. ///
  226. /// \param __a
  227. ///    A 128-bit vector of [2 x double] containing one of the operands. The
  228. ///    upper 64 bits of this operand are copied to the upper 64 bits of the
  229. ///    result.
  230. /// \param __b
  231. ///    A 128-bit vector of [2 x double] containing one of the operands. The
  232. ///    square root is calculated using the lower 64 bits of this operand.
  233. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  234. ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
  235. ///    bits are copied from the upper 64 bits of operand \a __a.
  236. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
  237.                                                          __m128d __b) {
  238.   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
  239.   return __extension__(__m128d){__c[0], __a[1]};
  240. }
  241.  
  242. /// Calculates the square root of the each of two values stored in a
  243. ///    128-bit vector of [2 x double].
  244. ///
  245. /// \headerfile <x86intrin.h>
  246. ///
  247. /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
  248. ///
  249. /// \param __a
  250. ///    A 128-bit vector of [2 x double].
  251. /// \returns A 128-bit vector of [2 x double] containing the square roots of the
  252. ///    values in the operand.
  253. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
  254.   return __builtin_ia32_sqrtpd((__v2df)__a);
  255. }
  256.  
  257. /// Compares lower 64-bit double-precision values of both operands, and
  258. ///    returns the lesser of the pair of values in the lower 64-bits of the
  259. ///    result. The upper 64 bits of the result are copied from the upper
  260. ///    double-precision value of the first operand.
  261. ///
  262. /// \headerfile <x86intrin.h>
  263. ///
  264. /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
  265. ///
  266. /// \param __a
  267. ///    A 128-bit vector of [2 x double] containing one of the operands. The
  268. ///    lower 64 bits of this operand are used in the comparison.
  269. /// \param __b
  270. ///    A 128-bit vector of [2 x double] containing one of the operands. The
  271. ///    lower 64 bits of this operand are used in the comparison.
  272. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  273. ///    minimum value between both operands. The upper 64 bits are copied from
  274. ///    the upper 64 bits of the first source operand.
  275. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
  276.                                                         __m128d __b) {
  277.   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
  278. }
  279.  
  280. /// Performs element-by-element comparison of the two 128-bit vectors of
  281. ///    [2 x double] and returns the vector containing the lesser of each pair of
  282. ///    values.
  283. ///
  284. /// \headerfile <x86intrin.h>
  285. ///
  286. /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
  287. ///
  288. /// \param __a
  289. ///    A 128-bit vector of [2 x double] containing one of the operands.
  290. /// \param __b
  291. ///    A 128-bit vector of [2 x double] containing one of the operands.
  292. /// \returns A 128-bit vector of [2 x double] containing the minimum values
  293. ///    between both operands.
  294. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
  295.                                                         __m128d __b) {
  296.   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
  297. }
  298.  
  299. /// Compares lower 64-bit double-precision values of both operands, and
  300. ///    returns the greater of the pair of values in the lower 64-bits of the
  301. ///    result. The upper 64 bits of the result are copied from the upper
  302. ///    double-precision value of the first operand.
  303. ///
  304. /// \headerfile <x86intrin.h>
  305. ///
  306. /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
  307. ///
  308. /// \param __a
  309. ///    A 128-bit vector of [2 x double] containing one of the operands. The
  310. ///    lower 64 bits of this operand are used in the comparison.
  311. /// \param __b
  312. ///    A 128-bit vector of [2 x double] containing one of the operands. The
  313. ///    lower 64 bits of this operand are used in the comparison.
  314. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  315. ///    maximum value between both operands. The upper 64 bits are copied from
  316. ///    the upper 64 bits of the first source operand.
  317. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
  318.                                                         __m128d __b) {
  319.   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
  320. }
  321.  
  322. /// Performs element-by-element comparison of the two 128-bit vectors of
  323. ///    [2 x double] and returns the vector containing the greater of each pair
  324. ///    of values.
  325. ///
  326. /// \headerfile <x86intrin.h>
  327. ///
  328. /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
  329. ///
  330. /// \param __a
  331. ///    A 128-bit vector of [2 x double] containing one of the operands.
  332. /// \param __b
  333. ///    A 128-bit vector of [2 x double] containing one of the operands.
  334. /// \returns A 128-bit vector of [2 x double] containing the maximum values
  335. ///    between both operands.
  336. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
  337.                                                         __m128d __b) {
  338.   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
  339. }
  340.  
  341. /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
  342. ///
  343. /// \headerfile <x86intrin.h>
  344. ///
  345. /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
  346. ///
  347. /// \param __a
  348. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  349. /// \param __b
  350. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  351. /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
  352. ///    values between both operands.
  353. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
  354.                                                         __m128d __b) {
  355.   return (__m128d)((__v2du)__a & (__v2du)__b);
  356. }
  357.  
  358. /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
  359. ///    the one's complement of the values contained in the first source operand.
  360. ///
  361. /// \headerfile <x86intrin.h>
  362. ///
  363. /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
  364. ///
  365. /// \param __a
  366. ///    A 128-bit vector of [2 x double] containing the left source operand. The
  367. ///    one's complement of this value is used in the bitwise AND.
  368. /// \param __b
  369. ///    A 128-bit vector of [2 x double] containing the right source operand.
  370. /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
  371. ///    values in the second operand and the one's complement of the first
  372. ///    operand.
  373. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
  374.                                                            __m128d __b) {
  375.   return (__m128d)(~(__v2du)__a & (__v2du)__b);
  376. }
  377.  
  378. /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
  379. ///
  380. /// \headerfile <x86intrin.h>
  381. ///
  382. /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
  383. ///
  384. /// \param __a
  385. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  386. /// \param __b
  387. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  388. /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
  389. ///    values between both operands.
  390. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
  391.                                                        __m128d __b) {
  392.   return (__m128d)((__v2du)__a | (__v2du)__b);
  393. }
  394.  
  395. /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
  396. ///
  397. /// \headerfile <x86intrin.h>
  398. ///
  399. /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
  400. ///
  401. /// \param __a
  402. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  403. /// \param __b
  404. ///    A 128-bit vector of [2 x double] containing one of the source operands.
  405. /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
  406. ///    values between both operands.
  407. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
  408.                                                         __m128d __b) {
  409.   return (__m128d)((__v2du)__a ^ (__v2du)__b);
  410. }
  411.  
  412. /// Compares each of the corresponding double-precision values of the
  413. ///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
  414. ///    for false, 0xFFFFFFFFFFFFFFFF for true.
  415. ///
  416. /// \headerfile <x86intrin.h>
  417. ///
  418. /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
  419. ///
  420. /// \param __a
  421. ///    A 128-bit vector of [2 x double].
  422. /// \param __b
  423. ///    A 128-bit vector of [2 x double].
  424. /// \returns A 128-bit vector containing the comparison results.
  425. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
  426.                                                           __m128d __b) {
  427.   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
  428. }
  429.  
  430. /// Compares each of the corresponding double-precision values of the
  431. ///    128-bit vectors of [2 x double] to determine if the values in the first
  432. ///    operand are less than those in the second operand. Each comparison
  433. ///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  434. ///
  435. /// \headerfile <x86intrin.h>
  436. ///
  437. /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
  438. ///
  439. /// \param __a
  440. ///    A 128-bit vector of [2 x double].
  441. /// \param __b
  442. ///    A 128-bit vector of [2 x double].
  443. /// \returns A 128-bit vector containing the comparison results.
  444. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
  445.                                                           __m128d __b) {
  446.   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
  447. }
  448.  
  449. /// Compares each of the corresponding double-precision values of the
  450. ///    128-bit vectors of [2 x double] to determine if the values in the first
  451. ///    operand are less than or equal to those in the second operand.
  452. ///
  453. ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  454. ///
  455. /// \headerfile <x86intrin.h>
  456. ///
  457. /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
  458. ///
  459. /// \param __a
  460. ///    A 128-bit vector of [2 x double].
  461. /// \param __b
  462. ///    A 128-bit vector of [2 x double].
  463. /// \returns A 128-bit vector containing the comparison results.
  464. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
  465.                                                           __m128d __b) {
  466.   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
  467. }
  468.  
  469. /// Compares each of the corresponding double-precision values of the
  470. ///    128-bit vectors of [2 x double] to determine if the values in the first
  471. ///    operand are greater than those in the second operand.
  472. ///
  473. ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  474. ///
  475. /// \headerfile <x86intrin.h>
  476. ///
  477. /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
  478. ///
  479. /// \param __a
  480. ///    A 128-bit vector of [2 x double].
  481. /// \param __b
  482. ///    A 128-bit vector of [2 x double].
  483. /// \returns A 128-bit vector containing the comparison results.
  484. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
  485.                                                           __m128d __b) {
  486.   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
  487. }
  488.  
  489. /// Compares each of the corresponding double-precision values of the
  490. ///    128-bit vectors of [2 x double] to determine if the values in the first
  491. ///    operand are greater than or equal to those in the second operand.
  492. ///
  493. ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  494. ///
  495. /// \headerfile <x86intrin.h>
  496. ///
  497. /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
  498. ///
  499. /// \param __a
  500. ///    A 128-bit vector of [2 x double].
  501. /// \param __b
  502. ///    A 128-bit vector of [2 x double].
  503. /// \returns A 128-bit vector containing the comparison results.
  504. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
  505.                                                           __m128d __b) {
  506.   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
  507. }
  508.  
  509. /// Compares each of the corresponding double-precision values of the
  510. ///    128-bit vectors of [2 x double] to determine if the values in the first
  511. ///    operand are ordered with respect to those in the second operand.
  512. ///
  513. ///    A pair of double-precision values are "ordered" with respect to each
  514. ///    other if neither value is a NaN. Each comparison yields 0x0 for false,
  515. ///    0xFFFFFFFFFFFFFFFF for true.
  516. ///
  517. /// \headerfile <x86intrin.h>
  518. ///
  519. /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
  520. ///
  521. /// \param __a
  522. ///    A 128-bit vector of [2 x double].
  523. /// \param __b
  524. ///    A 128-bit vector of [2 x double].
  525. /// \returns A 128-bit vector containing the comparison results.
  526. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
  527.                                                            __m128d __b) {
  528.   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
  529. }
  530.  
  531. /// Compares each of the corresponding double-precision values of the
  532. ///    128-bit vectors of [2 x double] to determine if the values in the first
  533. ///    operand are unordered with respect to those in the second operand.
  534. ///
  535. ///    A pair of double-precision values are "unordered" with respect to each
  536. ///    other if one or both values are NaN. Each comparison yields 0x0 for
  537. ///    false, 0xFFFFFFFFFFFFFFFF for true.
  538. ///
  539. /// \headerfile <x86intrin.h>
  540. ///
  541. /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
  542. ///   instruction.
  543. ///
  544. /// \param __a
  545. ///    A 128-bit vector of [2 x double].
  546. /// \param __b
  547. ///    A 128-bit vector of [2 x double].
  548. /// \returns A 128-bit vector containing the comparison results.
  549. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
  550.                                                              __m128d __b) {
  551.   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
  552. }
  553.  
  554. /// Compares each of the corresponding double-precision values of the
  555. ///    128-bit vectors of [2 x double] to determine if the values in the first
  556. ///    operand are unequal to those in the second operand.
  557. ///
  558. ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  559. ///
  560. /// \headerfile <x86intrin.h>
  561. ///
  562. /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
  563. ///
  564. /// \param __a
  565. ///    A 128-bit vector of [2 x double].
  566. /// \param __b
  567. ///    A 128-bit vector of [2 x double].
  568. /// \returns A 128-bit vector containing the comparison results.
  569. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
  570.                                                            __m128d __b) {
  571.   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
  572. }
  573.  
  574. /// Compares each of the corresponding double-precision values of the
  575. ///    128-bit vectors of [2 x double] to determine if the values in the first
  576. ///    operand are not less than those in the second operand.
  577. ///
  578. ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  579. ///
  580. /// \headerfile <x86intrin.h>
  581. ///
  582. /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
  583. ///
  584. /// \param __a
  585. ///    A 128-bit vector of [2 x double].
  586. /// \param __b
  587. ///    A 128-bit vector of [2 x double].
  588. /// \returns A 128-bit vector containing the comparison results.
  589. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
  590.                                                            __m128d __b) {
  591.   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
  592. }
  593.  
  594. /// Compares each of the corresponding double-precision values of the
  595. ///    128-bit vectors of [2 x double] to determine if the values in the first
  596. ///    operand are not less than or equal to those in the second operand.
  597. ///
  598. ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  599. ///
  600. /// \headerfile <x86intrin.h>
  601. ///
  602. /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
  603. ///
  604. /// \param __a
  605. ///    A 128-bit vector of [2 x double].
  606. /// \param __b
  607. ///    A 128-bit vector of [2 x double].
  608. /// \returns A 128-bit vector containing the comparison results.
  609. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
  610.                                                            __m128d __b) {
  611.   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
  612. }
  613.  
  614. /// Compares each of the corresponding double-precision values of the
  615. ///    128-bit vectors of [2 x double] to determine if the values in the first
  616. ///    operand are not greater than those in the second operand.
  617. ///
  618. ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  619. ///
  620. /// \headerfile <x86intrin.h>
  621. ///
  622. /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
  623. ///
  624. /// \param __a
  625. ///    A 128-bit vector of [2 x double].
  626. /// \param __b
  627. ///    A 128-bit vector of [2 x double].
  628. /// \returns A 128-bit vector containing the comparison results.
  629. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
  630.                                                            __m128d __b) {
  631.   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
  632. }
  633.  
  634. /// Compares each of the corresponding double-precision values of the
  635. ///    128-bit vectors of [2 x double] to determine if the values in the first
  636. ///    operand are not greater than or equal to those in the second operand.
  637. ///
  638. ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  639. ///
  640. /// \headerfile <x86intrin.h>
  641. ///
  642. /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
  643. ///
  644. /// \param __a
  645. ///    A 128-bit vector of [2 x double].
  646. /// \param __b
  647. ///    A 128-bit vector of [2 x double].
  648. /// \returns A 128-bit vector containing the comparison results.
  649. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
  650.                                                            __m128d __b) {
  651.   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
  652. }
  653.  
  654. /// Compares the lower double-precision floating-point values in each of
  655. ///    the two 128-bit floating-point vectors of [2 x double] for equality.
  656. ///
  657. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  658. ///
  659. /// \headerfile <x86intrin.h>
  660. ///
  661. /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
  662. ///
  663. /// \param __a
  664. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  665. ///    compared to the lower double-precision value of \a __b.
  666. /// \param __b
  667. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  668. ///    compared to the lower double-precision value of \a __a.
  669. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  670. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  671. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
  672.                                                           __m128d __b) {
  673.   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
  674. }
  675.  
  676. /// Compares the lower double-precision floating-point values in each of
  677. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  678. ///    the value in the first parameter is less than the corresponding value in
  679. ///    the second parameter.
  680. ///
  681. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  682. ///
  683. /// \headerfile <x86intrin.h>
  684. ///
  685. /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
  686. ///
  687. /// \param __a
  688. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  689. ///    compared to the lower double-precision value of \a __b.
  690. /// \param __b
  691. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  692. ///    compared to the lower double-precision value of \a __a.
  693. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  694. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  695. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
  696.                                                           __m128d __b) {
  697.   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
  698. }
  699.  
  700. /// Compares the lower double-precision floating-point values in each of
  701. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  702. ///    the value in the first parameter is less than or equal to the
  703. ///    corresponding value in the second parameter.
  704. ///
  705. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  706. ///
  707. /// \headerfile <x86intrin.h>
  708. ///
  709. /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
  710. ///
  711. /// \param __a
  712. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  713. ///    compared to the lower double-precision value of \a __b.
  714. /// \param __b
  715. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  716. ///    compared to the lower double-precision value of \a __a.
  717. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  718. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  719. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
  720.                                                           __m128d __b) {
  721.   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
  722. }
  723.  
  724. /// Compares the lower double-precision floating-point values in each of
  725. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  726. ///    the value in the first parameter is greater than the corresponding value
  727. ///    in the second parameter.
  728. ///
  729. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  730. ///
  731. /// \headerfile <x86intrin.h>
  732. ///
  733. /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
  734. ///
  735. /// \param __a
  736. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  737. ///    compared to the lower double-precision value of \a __b.
  738. /// \param __b
  739. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  740. ///    compared to the lower double-precision value of \a __a.
  741. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  742. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  743. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
  744.                                                           __m128d __b) {
  745.   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); /* operands swapped: (a > b) is evaluated as (b < a) */
  746.   return __extension__(__m128d){__c[0], __a[1]}; /* element 0: comparison result; element 1: upper half of __a */
  747. }
  748.  
  749. /// Compares the lower double-precision floating-point values in each of
  750. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  751. ///    the value in the first parameter is greater than or equal to the
  752. ///    corresponding value in the second parameter.
  753. ///
  754. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  755. ///
  756. /// \headerfile <x86intrin.h>
  757. ///
  758. /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
  759. ///
  760. /// \param __a
  761. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  762. ///    compared to the lower double-precision value of \a __b.
  763. /// \param __b
  764. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  765. ///    compared to the lower double-precision value of \a __a.
  766. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  767. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  768. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
  769.                                                           __m128d __b) {
  770.   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); /* operands swapped: (a >= b) is evaluated as (b <= a) */
  771.   return __extension__(__m128d){__c[0], __a[1]}; /* element 0: comparison result; element 1: upper half of __a */
  772. }
  773.  
  774. /// Compares the lower double-precision floating-point values in each of
  775. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  776. ///    the value in the first parameter is "ordered" with respect to the
  777. ///    corresponding value in the second parameter.
  778. ///
  779. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
  780. ///    of double-precision values are "ordered" with respect to each other if
  781. ///    neither value is a NaN.
  782. ///
  783. /// \headerfile <x86intrin.h>
  784. ///
  785. /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
  786. ///
  787. /// \param __a
  788. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  789. ///    compared to the lower double-precision value of \a __b.
  790. /// \param __b
  791. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  792. ///    compared to the lower double-precision value of \a __a.
  793. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  794. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  795. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
  796.                                                            __m128d __b) {
  797.   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
  798. }
  799.  
  800. /// Compares the lower double-precision floating-point values in each of
  801. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  802. ///    the value in the first parameter is "unordered" with respect to the
  803. ///    corresponding value in the second parameter.
  804. ///
  805. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
  806. ///    of double-precision values are "unordered" with respect to each other if
  807. ///    one or both values are NaN.
  808. ///
  809. /// \headerfile <x86intrin.h>
  810. ///
  811. /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
  812. ///    instruction.
  813. ///
  814. /// \param __a
  815. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  816. ///    compared to the lower double-precision value of \a __b.
  817. /// \param __b
  818. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  819. ///    compared to the lower double-precision value of \a __a.
  820. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  821. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  822. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
  823.                                                              __m128d __b) {
  824.   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
  825. }
  826.  
  827. /// Compares the lower double-precision floating-point values in each of
  828. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  829. ///    the value in the first parameter is not equal to the corresponding value
  830. ///    in the second parameter.
  831. ///
  832. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  833. ///
  834. /// \headerfile <x86intrin.h>
  835. ///
  836. /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
  837. ///
  838. /// \param __a
  839. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  840. ///    compared to the lower double-precision value of \a __b.
  841. /// \param __b
  842. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  843. ///    compared to the lower double-precision value of \a __a.
  844. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  845. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  846. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
  847.                                                            __m128d __b) {
  848.   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
  849. }
  850.  
  851. /// Compares the lower double-precision floating-point values in each of
  852. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  853. ///    the value in the first parameter is not less than the corresponding
  854. ///    value in the second parameter.
  855. ///
  856. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  857. ///
  858. /// \headerfile <x86intrin.h>
  859. ///
  860. /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
  861. ///
  862. /// \param __a
  863. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  864. ///    compared to the lower double-precision value of \a __b.
  865. /// \param __b
  866. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  867. ///    compared to the lower double-precision value of \a __a.
  868. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  869. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  870. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
  871.                                                            __m128d __b) {
  872.   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
  873. }
  874.  
  875. /// Compares the lower double-precision floating-point values in each of
  876. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  877. ///    the value in the first parameter is not less than or equal to the
  878. ///    corresponding value in the second parameter.
  879. ///
  880. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  881. ///
  882. /// \headerfile <x86intrin.h>
  883. ///
  884. /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
  885. ///
  886. /// \param __a
  887. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  888. ///    compared to the lower double-precision value of \a __b.
  889. /// \param __b
  890. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  891. ///    compared to the lower double-precision value of \a __a.
  892. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  893. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  894. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
  895.                                                            __m128d __b) {
  896.   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
  897. }
  898.  
  899. /// Compares the lower double-precision floating-point values in each of
  900. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  901. ///    the value in the first parameter is not greater than the corresponding
  902. ///    value in the second parameter.
  903. ///
  904. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  905. ///
  906. /// \headerfile <x86intrin.h>
  907. ///
  908. /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
  909. ///
  910. /// \param __a
  911. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  912. ///    compared to the lower double-precision value of \a __b.
  913. /// \param __b
  914. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  915. ///    compared to the lower double-precision value of \a __a.
  916. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  917. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  918. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
  919.                                                            __m128d __b) {
  920.   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); /* operands swapped: (a not > b) is evaluated as (b not < a) */
  921.   return __extension__(__m128d){__c[0], __a[1]}; /* element 0: comparison result; element 1: upper half of __a */
  922. }
  923.  
  924. /// Compares the lower double-precision floating-point values in each of
  925. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  926. ///    the value in the first parameter is not greater than or equal to the
  927. ///    corresponding value in the second parameter.
  928. ///
  929. ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
  930. ///
  931. /// \headerfile <x86intrin.h>
  932. ///
  933. /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
  934. ///
  935. /// \param __a
  936. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  937. ///    compared to the lower double-precision value of \a __b.
  938. /// \param __b
  939. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  940. ///    compared to the lower double-precision value of \a __a.
  941. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  942. ///    result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  943. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
  944.                                                            __m128d __b) {
  945.   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); /* operands swapped: (a not >= b) is evaluated as (b not <= a) */
  946.   return __extension__(__m128d){__c[0], __a[1]}; /* element 0: comparison result; element 1: upper half of __a */
  947. }
  948.  
  949. /// Compares the lower double-precision floating-point values in each of
  950. ///    the two 128-bit floating-point vectors of [2 x double] for equality.
  951. ///
  952. ///    The comparison yields 0 for false, 1 for true. If either of the two
  953. ///    lower double-precision values is NaN, 0 is returned.
  954. ///
  955. /// \headerfile <x86intrin.h>
  956. ///
  957. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  958. ///
  959. /// \param __a
  960. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  961. ///    compared to the lower double-precision value of \a __b.
  962. /// \param __b
  963. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  964. ///    compared to the lower double-precision value of \a __a.
  965. /// \returns An integer containing the comparison result. If either of the two
  966. ///    lower double-precision values is NaN, 0 is returned.
  967. static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
  968.                                                        __m128d __b) {
  969.   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
  970. }
  971.  
  972. /// Compares the lower double-precision floating-point values in each of
  973. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  974. ///    the value in the first parameter is less than the corresponding value in
  975. ///    the second parameter.
  976. ///
  977. ///    The comparison yields 0 for false, 1 for true. If either of the two
  978. ///    lower double-precision values is NaN, 0 is returned.
  979. ///
  980. /// \headerfile <x86intrin.h>
  981. ///
  982. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  983. ///
  984. /// \param __a
  985. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  986. ///    compared to the lower double-precision value of \a __b.
  987. /// \param __b
  988. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  989. ///    compared to the lower double-precision value of \a __a.
  990. /// \returns An integer containing the comparison result. If either of the two
  991. ///    lower double-precision values is NaN, 0 is returned.
  992. static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
  993.                                                        __m128d __b) {
  994.   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
  995. }
  996.  
  997. /// Compares the lower double-precision floating-point values in each of
  998. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  999. ///    the value in the first parameter is less than or equal to the
  1000. ///    corresponding value in the second parameter.
  1001. ///
  1002. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1003. ///    lower double-precision values is NaN, 0 is returned.
  1004. ///
  1005. /// \headerfile <x86intrin.h>
  1006. ///
  1007. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  1008. ///
  1009. /// \param __a
  1010. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1011. ///    compared to the lower double-precision value of \a __b.
  1012. /// \param __b
  1013. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1014. ///    compared to the lower double-precision value of \a __a.
  1015. /// \returns An integer containing the comparison result. If either of the two
  1016. ///    lower double-precision values is NaN, 0 is returned.
  1017. static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
  1018.                                                        __m128d __b) {
  1019.   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
  1020. }
  1021.  
  1022. /// Compares the lower double-precision floating-point values in each of
  1023. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  1024. ///    the value in the first parameter is greater than the corresponding value
  1025. ///    in the second parameter.
  1026. ///
  1027. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1028. ///    lower double-precision values is NaN, 0 is returned.
  1029. ///
  1030. /// \headerfile <x86intrin.h>
  1031. ///
  1032. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  1033. ///
  1034. /// \param __a
  1035. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1036. ///    compared to the lower double-precision value of \a __b.
  1037. /// \param __b
  1038. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1039. ///    compared to the lower double-precision value of \a __a.
  1040. /// \returns An integer containing the comparison result. If either of the two
  1041. ///    lower double-precision values is NaN, 0 is returned.
  1042. static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
  1043.                                                        __m128d __b) {
  1044.   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
  1045. }
  1046.  
  1047. /// Compares the lower double-precision floating-point values in each of
  1048. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  1049. ///    the value in the first parameter is greater than or equal to the
  1050. ///    corresponding value in the second parameter.
  1051. ///
  1052. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1053. ///    lower double-precision values is NaN, 0 is returned.
  1054. ///
  1055. /// \headerfile <x86intrin.h>
  1056. ///
  1057. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  1058. ///
  1059. /// \param __a
  1060. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1061. ///    compared to the lower double-precision value of \a __b.
  1062. /// \param __b
  1063. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1064. ///    compared to the lower double-precision value of \a __a.
  1065. /// \returns An integer containing the comparison result. If either of the two
  1066. ///    lower double-precision values is NaN, 0 is returned.
  1067. static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
  1068.                                                        __m128d __b) {
  1069.   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
  1070. }
  1071.  
  1072. /// Compares the lower double-precision floating-point values in each of
  1073. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  1074. ///    the value in the first parameter is not equal to the corresponding value
  1075. ///    in the second parameter.
  1076. ///
  1077. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1078. ///    lower double-precision values is NaN, 1 is returned.
  1079. ///
  1080. /// \headerfile <x86intrin.h>
  1081. ///
  1082. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  1083. ///
  1084. /// \param __a
  1085. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1086. ///    compared to the lower double-precision value of \a __b.
  1087. /// \param __b
  1088. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1089. ///    compared to the lower double-precision value of \a __a.
  1090. /// \returns An integer containing the comparison result. If either of the two
  1091. ///    lower double-precision values is NaN, 1 is returned.
  1092. static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
  1093.                                                         __m128d __b) {
  1094.   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
  1095. }
  1096.  
  1097. /// Compares the lower double-precision floating-point values in each of
  1098. ///    the two 128-bit floating-point vectors of [2 x double] for equality.
  1099. ///
  1100. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1101. ///    lower double-precision values is NaN, 0 is returned.
  1102. ///
  1103. /// \headerfile <x86intrin.h>
  1104. ///
  1105. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1106. ///
  1107. /// \param __a
  1108. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1109. ///    compared to the lower double-precision value of \a __b.
  1110. /// \param __b
  1111. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1112. ///    compared to the lower double-precision value of \a __a.
  1113. /// \returns An integer containing the comparison result. If either of the two
  1114. ///    lower double-precision values is NaN, 0 is returned.
  1115. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
  1116.                                                         __m128d __b) {
  1117.   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
  1118. }
  1119.  
  1120. /// Compares the lower double-precision floating-point values in each of
  1121. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  1122. ///    the value in the first parameter is less than the corresponding value in
  1123. ///    the second parameter.
  1124. ///
  1125. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1126. ///    lower double-precision values is NaN, 0 is returned.
  1127. ///
  1128. /// \headerfile <x86intrin.h>
  1129. ///
  1130. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1131. ///
  1132. /// \param __a
  1133. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1134. ///    compared to the lower double-precision value of \a __b.
  1135. /// \param __b
  1136. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1137. ///    compared to the lower double-precision value of \a __a.
  1138. /// \returns An integer containing the comparison result. If either of the two
  1139. ///    lower double-precision values is NaN, 0 is returned.
  1140. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
  1141.                                                         __m128d __b) {
  1142.   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
  1143. }
  1144.  
  1145. /// Compares the lower double-precision floating-point values in each of
  1146. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  1147. ///    the value in the first parameter is less than or equal to the
  1148. ///    corresponding value in the second parameter.
  1149. ///
  1150. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1151. ///    lower double-precision values is NaN, 0 is returned.
  1152. ///
  1153. /// \headerfile <x86intrin.h>
  1154. ///
  1155. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1156. ///
  1157. /// \param __a
  1158. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1159. ///    compared to the lower double-precision value of \a __b.
  1160. /// \param __b
  1161. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1162. ///    compared to the lower double-precision value of \a __a.
  1163. /// \returns An integer containing the comparison result. If either of the two
  1164. ///    lower double-precision values is NaN, 0 is returned.
  1165. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
  1166.                                                         __m128d __b) {
  1167.   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
  1168. }
  1169.  
  1170. /// Compares the lower double-precision floating-point values in each of
  1171. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  1172. ///    the value in the first parameter is greater than the corresponding value
  1173. ///    in the second parameter.
  1174. ///
  1175. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1176. ///    lower double-precision values is NaN, 0 is returned.
  1177. ///
  1178. /// \headerfile <x86intrin.h>
  1179. ///
  1180. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1181. ///
  1182. /// \param __a
  1183. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1184. ///    compared to the lower double-precision value of \a __b.
  1185. /// \param __b
  1186. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1187. ///    compared to the lower double-precision value of \a __a.
  1188. /// \returns An integer containing the comparison result. If either of the two
  1189. ///    lower double-precision values is NaN, 0 is returned.
  1190. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
  1191.                                                         __m128d __b) {
  1192.   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
  1193. }
  1194.  
  1195. /// Compares the lower double-precision floating-point values in each of
  1196. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  1197. ///    the value in the first parameter is greater than or equal to the
  1198. ///    corresponding value in the second parameter.
  1199. ///
  1200. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1201. ///    lower double-precision values is NaN, 0 is returned.
  1202. ///
  1203. /// \headerfile <x86intrin.h>
  1204. ///
  1205. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1206. ///
  1207. /// \param __a
  1208. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1209. ///    compared to the lower double-precision value of \a __b.
  1210. /// \param __b
  1211. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1212. ///    compared to the lower double-precision value of \a __a.
  1213. /// \returns An integer containing the comparison result. If either of the two
  1214. ///    lower double-precision values is NaN, 0 is returned.
  1215. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
  1216.                                                         __m128d __b) {
  1217.   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
  1218. }
  1219.  
  1220. /// Compares the lower double-precision floating-point values in each of
  1221. ///    the two 128-bit floating-point vectors of [2 x double] to determine if
  1222. ///    the value in the first parameter is not equal to the corresponding value
  1223. ///    in the second parameter.
  1224. ///
  1225. ///    The comparison yields 0 for false, 1 for true. If either of the two
  1226. ///    lower double-precision values is NaN, 1 is returned.
  1227. ///
  1228. /// \headerfile <x86intrin.h>
  1229. ///
  1230. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1231. ///
  1232. /// \param __a
  1233. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1234. ///    compared to the lower double-precision value of \a __b.
  1235. /// \param __b
  1236. ///    A 128-bit vector of [2 x double]. The lower double-precision value is
  1237. ///    compared to the lower double-precision value of \a __a.
  1238. /// \returns An integer containing the comparison result. If either of the two
  1239. ///    lower double-precision values is NaN, 1 is returned.
  1240. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
  1241.                                                          __m128d __b) {
  1242.   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
  1243. }
  1244.  
  1245. /// Converts the two double-precision floating-point elements of a
  1246. ///    128-bit vector of [2 x double] into two single-precision floating-point
  1247. ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
  1248. ///    The upper 64 bits of the result vector are set to zero.
  1249. ///
  1250. /// \headerfile <x86intrin.h>
  1251. ///
  1252. /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
  1253. ///
  1254. /// \param __a
  1255. ///    A 128-bit vector of [2 x double].
  1256. /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
  1257. ///    converted values. The upper 64 bits are set to zero.
  1258. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
  1259.   return __builtin_ia32_cvtpd2ps((__v2df)__a);
  1260. }
  1261.  
  1262. /// Converts the lower two single-precision floating-point elements of a
  1263. ///    128-bit vector of [4 x float] into two double-precision floating-point
  1264. ///    values, returned in a 128-bit vector of [2 x double]. The upper two
  1265. ///    elements of the input vector are unused.
  1266. ///
  1267. /// \headerfile <x86intrin.h>
  1268. ///
  1269. /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
  1270. ///
  1271. /// \param __a
  1272. ///    A 128-bit vector of [4 x float]. The lower two single-precision
  1273. ///    floating-point elements are converted to double-precision values. The
  1274. ///    upper two elements are unused.
  1275. /// \returns A 128-bit vector of [2 x double] containing the converted values.
  1276. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
  1277.   return (__m128d) __builtin_convertvector(
  1278.       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
  1279. }
  1280.  
  1281. /// Converts the lower two integer elements of a 128-bit vector of
  1282. ///    [4 x i32] into two double-precision floating-point values, returned in a
  1283. ///    128-bit vector of [2 x double].
  1284. ///
  1285. ///    The upper two elements of the input vector are unused.
  1286. ///
  1287. /// \headerfile <x86intrin.h>
  1288. ///
  1289. /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
  1290. ///
  1291. /// \param __a
  1292. ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
  1293. ///    converted to double-precision values.
  1294. ///
  1295. ///    The upper two elements are unused.
  1296. /// \returns A 128-bit vector of [2 x double] containing the converted values.
  1297. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
  1298.   return (__m128d) __builtin_convertvector(
  1299.       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
  1300. }
  1301.  
  1302. /// Converts the two double-precision floating-point elements of a
  1303. ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
  1304. ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
  1305. ///    64 bits of the result vector are set to zero.
  1306. ///
  1307. /// \headerfile <x86intrin.h>
  1308. ///
  1309. /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
  1310. ///
  1311. /// \param __a
  1312. ///    A 128-bit vector of [2 x double].
  1313. /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
  1314. ///    converted values. The upper 64 bits are set to zero.
  1315. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
  1316.   return __builtin_ia32_cvtpd2dq((__v2df)__a);
  1317. }
  1318.  
  1319. /// Converts the low-order element of a 128-bit vector of [2 x double]
  1320. ///    into a 32-bit signed integer value.
  1321. ///
  1322. /// \headerfile <x86intrin.h>
  1323. ///
  1324. /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
  1325. ///
  1326. /// \param __a
  1327. ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
  1328. ///    conversion.
  1329. /// \returns A 32-bit signed integer containing the converted value.
  1330. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
  1331.   return __builtin_ia32_cvtsd2si((__v2df)__a);
  1332. }
  1333.  
  1334. /// Converts the lower double-precision floating-point element of a
  1335. ///    128-bit vector of [2 x double], in the second parameter, into a
  1336. ///    single-precision floating-point value, returned in the lower 32 bits of a
  1337. ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
  1338. ///    copied from the upper 96 bits of the first parameter.
  1339. ///
  1340. /// \headerfile <x86intrin.h>
  1341. ///
  1342. /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
  1343. ///
  1344. /// \param __a
  1345. ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
  1346. ///    copied to the upper 96 bits of the result.
  1347. /// \param __b
  1348. ///    A 128-bit vector of [2 x double]. The lower double-precision
  1349. ///    floating-point element is used in the conversion.
  1350. /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
  1351. ///    converted value from the second parameter. The upper 96 bits are copied
  1352. ///    from the upper 96 bits of the first parameter.
  1353. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
  1354.                                                          __m128d __b) {
  1355.   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
  1356. }
  1357.  
  1358. /// Converts a 32-bit signed integer value, in the second parameter, into
  1359. ///    a double-precision floating-point value, returned in the lower 64 bits of
  1360. ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
  1361. ///    are copied from the upper 64 bits of the first parameter.
  1362. ///
  1363. /// \headerfile <x86intrin.h>
  1364. ///
  1365. /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
  1366. ///
  1367. /// \param __a
  1368. ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
  1369. ///    copied to the upper 64 bits of the result.
  1370. /// \param __b
  1371. ///    A 32-bit signed integer containing the value to be converted.
  1372. /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
  1373. ///    converted value from the second parameter. The upper 64 bits are copied
  1374. ///    from the upper 64 bits of the first parameter.
  1375. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
  1376.                                                             int __b) {
  1377.   __a[0] = __b;
  1378.   return __a;
  1379. }
  1380.  
  1381. /// Converts the lower single-precision floating-point element of a
  1382. ///    128-bit vector of [4 x float], in the second parameter, into a
  1383. ///    double-precision floating-point value, returned in the lower 64 bits of
  1384. ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
  1385. ///    are copied from the upper 64 bits of the first parameter.
  1386. ///
  1387. /// \headerfile <x86intrin.h>
  1388. ///
  1389. /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
  1390. ///
  1391. /// \param __a
  1392. ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
  1393. ///    copied to the upper 64 bits of the result.
  1394. /// \param __b
  1395. ///    A 128-bit vector of [4 x float]. The lower single-precision
  1396. ///    floating-point element is used in the conversion.
  1397. /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
  1398. ///    converted value from the second parameter. The upper 64 bits are copied
  1399. ///    from the upper 64 bits of the first parameter.
  1400. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
  1401.                                                           __m128 __b) {
  1402.   __a[0] = __b[0];
  1403.   return __a;
  1404. }
  1405.  
  1406. /// Converts the two double-precision floating-point elements of a
  1407. ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
  1408. ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
  1409. ///
  1410. ///    If the result of either conversion is inexact, the result is truncated
  1411. ///    (rounded towards zero) regardless of the current MXCSR setting. The upper
  1412. ///    64 bits of the result vector are set to zero.
  1413. ///
  1414. /// \headerfile <x86intrin.h>
  1415. ///
  1416. /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
  1417. ///   instruction.
  1418. ///
  1419. /// \param __a
  1420. ///    A 128-bit vector of [2 x double].
  1421. /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
  1422. ///    converted values. The upper 64 bits are set to zero.
  1423. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
  1424.   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
  1425. }
  1426.  
  1427. /// Converts the low-order element of a [2 x double] vector into a 32-bit
  1428. ///    signed integer value, truncating the result when it is inexact.
  1429. ///
  1430. /// \headerfile <x86intrin.h>
  1431. ///
  1432. /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
  1433. ///   instruction.
  1434. ///
  1435. /// \param __a
  1436. ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
  1437. ///    conversion.
  1438. /// \returns A 32-bit signed integer containing the converted value.
  1439. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
  1440.   return __builtin_ia32_cvttsd2si((__v2df)__a);
  1441. }
  1442.  
  1443. /// Converts the two double-precision floating-point elements of a
  1444. ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
  1445. ///    returned in a 64-bit vector of [2 x i32].
  1446. ///
  1447. /// \headerfile <x86intrin.h>
  1448. ///
  1449. /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
  1450. ///
  1451. /// \param __a
  1452. ///    A 128-bit vector of [2 x double].
  1453. /// \returns A 64-bit vector of [2 x i32] containing the converted values.
  1454. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
  1455.   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
  1456. }
  1457.  
  1458. /// Converts the two double-precision floating-point elements of a
  1459. ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
  1460. ///    returned in a 64-bit vector of [2 x i32].
  1461. ///
  1462. ///    If the result of either conversion is inexact, the result is truncated
  1463. ///    (rounded towards zero) regardless of the current MXCSR setting.
  1464. ///
  1465. /// \headerfile <x86intrin.h>
  1466. ///
  1467. /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
  1468. ///
  1469. /// \param __a
  1470. ///    A 128-bit vector of [2 x double].
  1471. /// \returns A 64-bit vector of [2 x i32] containing the converted values.
  1472. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
  1473.   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
  1474. }
  1475.  
  1476. /// Converts the two signed 32-bit integer elements of a 64-bit vector of
  1477. ///    [2 x i32] into two double-precision floating-point values, returned in a
  1478. ///    128-bit vector of [2 x double].
  1479. ///
  1480. /// \headerfile <x86intrin.h>
  1481. ///
  1482. /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
  1483. ///
  1484. /// \param __a
  1485. ///    A 64-bit vector of [2 x i32].
  1486. /// \returns A 128-bit vector of [2 x double] containing the converted values.
  1487. static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
  1488.   return __builtin_ia32_cvtpi2pd((__v2si)__a);
  1489. }
  1490.  
  1491. /// Returns the low-order element of a 128-bit vector of [2 x double] as
  1492. ///    a double-precision floating-point value.
  1493. ///
  1494. /// \headerfile <x86intrin.h>
  1495. ///
  1496. /// This intrinsic has no corresponding instruction.
  1497. ///
  1498. /// \param __a
  1499. ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
  1500. /// \returns A double-precision floating-point value copied from the lower 64
  1501. ///    bits of \a __a.
  1502. static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
  1503.   return __a[0];
  1504. }
  1505.  
  1506. /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
  1507. ///    memory location.
  1508. ///
  1509. /// \headerfile <x86intrin.h>
  1510. ///
  1511. /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
  1512. ///
  1513. /// \param __dp
  1514. ///    A pointer to a 128-bit memory location. The address of the memory
  1515. ///    location has to be 16-byte aligned.
  1516. /// \returns A 128-bit vector of [2 x double] containing the loaded values.
  1517. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
  1518.   return *(const __m128d *)__dp;
  1519. }
  1520.  
  1521. /// Loads a double-precision floating-point value from a specified memory
  1522. ///    location and duplicates it to both vector elements of a 128-bit vector of
  1523. ///    [2 x double].
  1524. ///
  1525. /// \headerfile <x86intrin.h>
  1526. ///
  1527. /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
  1528. ///
  1529. /// \param __dp
  1530. ///    A pointer to a memory location containing a double-precision value.
  1531. /// \returns A 128-bit vector of [2 x double] containing the loaded and
  1532. ///    duplicated values.
  1533. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
  1534.   struct __mm_load1_pd_struct {
  1535.     double __u;
  1536.   } __attribute__((__packed__, __may_alias__));
  1537.   double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
  1538.   return __extension__(__m128d){__u, __u};
  1539. }
  1540.  
/* Alternate spelling of _mm_load1_pd: expands to a call to _mm_load1_pd with
   the same argument. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
  1542.  
  1543. /// Loads two double-precision values, in reverse order, from an aligned
  1544. ///    memory location into a 128-bit vector of [2 x double].
  1545. ///
  1546. /// \headerfile <x86intrin.h>
  1547. ///
  1548. /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
  1549. /// needed shuffling instructions. In AVX mode, the shuffling may be combined
  1550. /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
  1551. ///
  1552. /// \param __dp
  1553. ///    A 16-byte aligned pointer to an array of double-precision values to be
  1554. ///    loaded in reverse order.
  1555. /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
  1556. ///    values.
  1557. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
  1558.   __m128d __u = *(const __m128d *)__dp;
  1559.   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
  1560. }
  1561.  
  1562. /// Loads a 128-bit floating-point vector of [2 x double] from an
  1563. ///    unaligned memory location.
  1564. ///
  1565. /// \headerfile <x86intrin.h>
  1566. ///
  1567. /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
  1568. ///
  1569. /// \param __dp
  1570. ///    A pointer to a 128-bit memory location. The address of the memory
  1571. ///    location does not have to be aligned.
  1572. /// \returns A 128-bit vector of [2 x double] containing the loaded values.
  1573. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
  1574.   struct __loadu_pd {
  1575.     __m128d_u __v;
  1576.   } __attribute__((__packed__, __may_alias__));
  1577.   return ((const struct __loadu_pd *)__dp)->__v;
  1578. }
  1579.  
  1580. /// Loads a 64-bit integer value to the low element of a 128-bit integer
  1581. ///    vector and clears the upper element.
  1582. ///
  1583. /// \headerfile <x86intrin.h>
  1584. ///
  1585. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  1586. ///
  1587. /// \param __a
  1588. ///    A pointer to a 64-bit memory location. The address of the memory
  1589. ///    location does not have to be aligned.
  1590. /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
  1591. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
  1592.   struct __loadu_si64 {
  1593.     long long __v;
  1594.   } __attribute__((__packed__, __may_alias__));
  1595.   long long __u = ((const struct __loadu_si64 *)__a)->__v;
  1596.   return __extension__(__m128i)(__v2di){__u, 0LL};
  1597. }
  1598.  
  1599. /// Loads a 32-bit integer value to the low element of a 128-bit integer
  1600. ///    vector and clears the upper element.
  1601. ///
  1602. /// \headerfile <x86intrin.h>
  1603. ///
  1604. /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
  1605. ///
  1606. /// \param __a
  1607. ///    A pointer to a 32-bit memory location. The address of the memory
  1608. ///    location does not have to be aligned.
  1609. /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
  1610. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
  1611.   struct __loadu_si32 {
  1612.     int __v;
  1613.   } __attribute__((__packed__, __may_alias__));
  1614.   int __u = ((const struct __loadu_si32 *)__a)->__v;
  1615.   return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
  1616. }
  1617.  
  1618. /// Loads a 16-bit integer value to the low element of a 128-bit integer
  1619. ///    vector and clears the upper element.
  1620. ///
  1621. /// \headerfile <x86intrin.h>
  1622. ///
  1623. /// This intrinsic does not correspond to a specific instruction.
  1624. ///
  1625. /// \param __a
  1626. ///    A pointer to a 16-bit memory location. The address of the memory
  1627. ///    location does not have to be aligned.
  1628. /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
  1629. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
  1630.   struct __loadu_si16 {
  1631.     short __v;
  1632.   } __attribute__((__packed__, __may_alias__));
  1633.   short __u = ((const struct __loadu_si16 *)__a)->__v;
  1634.   return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
  1635. }
  1636.  
  1637. /// Loads a 64-bit double-precision value to the low element of a
  1638. ///    128-bit integer vector and clears the upper element.
  1639. ///
  1640. /// \headerfile <x86intrin.h>
  1641. ///
  1642. /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
  1643. ///
  1644. /// \param __dp
  1645. ///    A pointer to a memory location containing a double-precision value.
  1646. ///    The address of the memory location does not have to be aligned.
  1647. /// \returns A 128-bit vector of [2 x double] containing the loaded value.
  1648. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
  1649.   struct __mm_load_sd_struct {
  1650.     double __u;
  1651.   } __attribute__((__packed__, __may_alias__));
  1652.   double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
  1653.   return __extension__(__m128d){__u, 0};
  1654. }
  1655.  
  1656. /// Loads a double-precision value into the high-order bits of a 128-bit
  1657. ///    vector of [2 x double]. The low-order bits are copied from the low-order
  1658. ///    bits of the first operand.
  1659. ///
  1660. /// \headerfile <x86intrin.h>
  1661. ///
  1662. /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
  1663. ///
  1664. /// \param __a
  1665. ///    A 128-bit vector of [2 x double]. \n
  1666. ///    Bits [63:0] are written to bits [63:0] of the result.
  1667. /// \param __dp
  1668. ///    A pointer to a 64-bit memory location containing a double-precision
  1669. ///    floating-point value that is loaded. The loaded value is written to bits
  1670. ///    [127:64] of the result. The address of the memory location does not have
  1671. ///    to be aligned.
  1672. /// \returns A 128-bit vector of [2 x double] containing the moved values.
  1673. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
  1674.                                                           double const *__dp) {
  1675.   struct __mm_loadh_pd_struct {
  1676.     double __u;
  1677.   } __attribute__((__packed__, __may_alias__));
  1678.   double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
  1679.   return __extension__(__m128d){__a[0], __u};
  1680. }
  1681.  
  1682. /// Loads a double-precision value into the low-order bits of a 128-bit
  1683. ///    vector of [2 x double]. The high-order bits are copied from the
  1684. ///    high-order bits of the first operand.
  1685. ///
  1686. /// \headerfile <x86intrin.h>
  1687. ///
  1688. /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
  1689. ///
  1690. /// \param __a
  1691. ///    A 128-bit vector of [2 x double]. \n
  1692. ///    Bits [127:64] are written to bits [127:64] of the result.
  1693. /// \param __dp
  1694. ///    A pointer to a 64-bit memory location containing a double-precision
  1695. ///    floating-point value that is loaded. The loaded value is written to bits
  1696. ///    [63:0] of the result. The address of the memory location does not have to
  1697. ///    be aligned.
  1698. /// \returns A 128-bit vector of [2 x double] containing the moved values.
  1699. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
  1700.                                                           double const *__dp) {
  1701.   struct __mm_loadl_pd_struct {
  1702.     double __u;
  1703.   } __attribute__((__packed__, __may_alias__));
  1704.   double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
  1705.   return __extension__(__m128d){__u, __a[1]};
  1706. }
  1707.  
  1708. /// Constructs a 128-bit floating-point vector of [2 x double] with
  1709. ///    unspecified content. This could be used as an argument to another
  1710. ///    intrinsic function where the argument is required but the value is not
  1711. ///    actually used.
  1712. ///
  1713. /// \headerfile <x86intrin.h>
  1714. ///
  1715. /// This intrinsic has no corresponding instruction.
  1716. ///
  1717. /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
  1718. ///    content.
  1719. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
  1720.   return (__m128d)__builtin_ia32_undef128();
  1721. }
  1722.  
  1723. /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
  1724. ///    64 bits of the vector are initialized with the specified double-precision
  1725. ///    floating-point value. The upper 64 bits are set to zero.
  1726. ///
  1727. /// \headerfile <x86intrin.h>
  1728. ///
  1729. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  1730. ///
  1731. /// \param __w
  1732. ///    A double-precision floating-point value used to initialize the lower 64
  1733. ///    bits of the result.
  1734. /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
  1735. ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
  1736. ///    set to zero.
  1737. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
  1738.   return __extension__(__m128d){__w, 0};
  1739. }
  1740.  
  1741. /// Constructs a 128-bit floating-point vector of [2 x double], with each
  1742. ///    of the two double-precision floating-point vector elements set to the
  1743. ///    specified double-precision floating-point value.
  1744. ///
  1745. /// \headerfile <x86intrin.h>
  1746. ///
  1747. /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
  1748. ///
  1749. /// \param __w
  1750. ///    A double-precision floating-point value used to initialize each vector
  1751. ///    element of the result.
  1752. /// \returns An initialized 128-bit floating-point vector of [2 x double].
  1753. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
  1754.   return __extension__(__m128d){__w, __w};
  1755. }
  1756.  
  1757. /// Constructs a 128-bit floating-point vector of [2 x double], with each
  1758. ///    of the two double-precision floating-point vector elements set to the
  1759. ///    specified double-precision floating-point value.
  1760. ///
  1761. /// \headerfile <x86intrin.h>
  1762. ///
  1763. /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
  1764. ///
  1765. /// \param __w
  1766. ///    A double-precision floating-point value used to initialize each vector
  1767. ///    element of the result.
  1768. /// \returns An initialized 128-bit floating-point vector of [2 x double].
  1769. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
  1770.   return _mm_set1_pd(__w);
  1771. }
  1772.  
  1773. /// Constructs a 128-bit floating-point vector of [2 x double]
  1774. ///    initialized with the specified double-precision floating-point values.
  1775. ///
  1776. /// \headerfile <x86intrin.h>
  1777. ///
  1778. /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
  1779. ///
  1780. /// \param __w
  1781. ///    A double-precision floating-point value used to initialize the upper 64
  1782. ///    bits of the result.
  1783. /// \param __x
  1784. ///    A double-precision floating-point value used to initialize the lower 64
  1785. ///    bits of the result.
  1786. /// \returns An initialized 128-bit floating-point vector of [2 x double].
  1787. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
  1788.                                                         double __x) {
  1789.   return __extension__(__m128d){__x, __w};
  1790. }
  1791.  
  1792. /// Constructs a 128-bit floating-point vector of [2 x double],
  1793. ///    initialized in reverse order with the specified double-precision
  1794. ///    floating-point values.
  1795. ///
  1796. /// \headerfile <x86intrin.h>
  1797. ///
  1798. /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
  1799. ///
  1800. /// \param __w
  1801. ///    A double-precision floating-point value used to initialize the lower 64
  1802. ///    bits of the result.
  1803. /// \param __x
  1804. ///    A double-precision floating-point value used to initialize the upper 64
  1805. ///    bits of the result.
  1806. /// \returns An initialized 128-bit floating-point vector of [2 x double].
  1807. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
  1808.                                                          double __x) {
  1809.   return __extension__(__m128d){__w, __x};
  1810. }
  1811.  
  1812. /// Constructs a 128-bit floating-point vector of [2 x double]
  1813. ///    initialized to zero.
  1814. ///
  1815. /// \headerfile <x86intrin.h>
  1816. ///
  1817. /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
  1818. ///
  1819. /// \returns An initialized 128-bit floating-point vector of [2 x double] with
  1820. ///    all elements set to zero.
  1821. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
  1822.   return __extension__(__m128d){0.0, 0.0};
  1823. }
  1824.  
  1825. /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
  1826. ///    64 bits are set to the lower 64 bits of the second parameter. The upper
  1827. ///    64 bits are set to the upper 64 bits of the first parameter.
  1828. ///
  1829. /// \headerfile <x86intrin.h>
  1830. ///
  1831. /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
  1832. ///
  1833. /// \param __a
  1834. ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
  1835. ///    upper 64 bits of the result.
  1836. /// \param __b
  1837. ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
  1838. ///    lower 64 bits of the result.
  1839. /// \returns A 128-bit vector of [2 x double] containing the moved values.
  1840. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
  1841.                                                          __m128d __b) {
  1842.   __a[0] = __b[0];
  1843.   return __a;
  1844. }
  1845.  
  1846. /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
  1847. ///    memory location.
  1848. ///
  1849. /// \headerfile <x86intrin.h>
  1850. ///
  1851. /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
  1852. ///
  1853. /// \param __dp
  1854. ///    A pointer to a 64-bit memory location.
  1855. /// \param __a
  1856. ///    A 128-bit vector of [2 x double] containing the value to be stored.
  1857. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
  1858.                                                        __m128d __a) {
  1859.   struct __mm_store_sd_struct {
  1860.     double __u;
  1861.   } __attribute__((__packed__, __may_alias__));
  1862.   ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
  1863. }
  1864.  
  1865. /// Moves packed double-precision values from a 128-bit vector of
  1866. ///    [2 x double] to a memory location.
  1867. ///
  1868. /// \headerfile <x86intrin.h>
  1869. ///
  1870. /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
  1871. ///
  1872. /// \param __dp
  1873. ///    A pointer to an aligned memory location that can store two
  1874. ///    double-precision values.
  1875. /// \param __a
  1876. ///    A packed 128-bit vector of [2 x double] containing the values to be
  1877. ///    moved.
  1878. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
  1879.                                                        __m128d __a) {
  1880.   *(__m128d *)__dp = __a;
  1881. }
  1882.  
  1883. /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
  1884. ///    the upper and lower 64 bits of a memory location.
  1885. ///
  1886. /// \headerfile <x86intrin.h>
  1887. ///
  1888. /// This intrinsic corresponds to the
  1889. ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
  1890. ///
  1891. /// \param __dp
  1892. ///    A pointer to a memory location that can store two double-precision
  1893. ///    values.
  1894. /// \param __a
  1895. ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
  1896. ///    of the values in \a __dp.
  1897. static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
  1898.                                                         __m128d __a) {
  1899.   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
  1900.   _mm_store_pd(__dp, __a);
  1901. }
  1902.  
  1903. /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
  1904. ///    the upper and lower 64 bits of a memory location.
  1905. ///
  1906. /// \headerfile <x86intrin.h>
  1907. ///
  1908. /// This intrinsic corresponds to the
  1909. ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
  1910. ///
  1911. /// \param __dp
  1912. ///    A pointer to a memory location that can store two double-precision
  1913. ///    values.
  1914. /// \param __a
  1915. ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
  1916. ///    of the values in \a __dp.
  1917. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
  1918.                                                         __m128d __a) {
  1919.   _mm_store1_pd(__dp, __a);
  1920. }
  1921.  
  1922. /// Stores a 128-bit vector of [2 x double] into an unaligned memory
  1923. ///    location.
  1924. ///
  1925. /// \headerfile <x86intrin.h>
  1926. ///
  1927. /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
  1928. ///
  1929. /// \param __dp
  1930. ///    A pointer to a 128-bit memory location. The address of the memory
  1931. ///    location does not have to be aligned.
  1932. /// \param __a
  1933. ///    A 128-bit vector of [2 x double] containing the values to be stored.
  1934. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
  1935.                                                         __m128d __a) {
  1936.   struct __storeu_pd {
  1937.     __m128d_u __v;
  1938.   } __attribute__((__packed__, __may_alias__));
  1939.   ((struct __storeu_pd *)__dp)->__v = __a;
  1940. }
  1941.  
  1942. /// Stores two double-precision values, in reverse order, from a 128-bit
  1943. ///    vector of [2 x double] to a 16-byte aligned memory location.
  1944. ///
  1945. /// \headerfile <x86intrin.h>
  1946. ///
  1947. /// This intrinsic corresponds to a shuffling instruction followed by a
  1948. /// <c> VMOVAPD / MOVAPD </c> instruction.
  1949. ///
  1950. /// \param __dp
  1951. ///    A pointer to a 16-byte aligned memory location that can store two
  1952. ///    double-precision values.
  1953. /// \param __a
  1954. ///    A 128-bit vector of [2 x double] containing the values to be reversed and
  1955. ///    stored.
  1956. static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
  1957.                                                         __m128d __a) {
  1958.   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
  1959.   *(__m128d *)__dp = __a;
  1960. }
  1961.  
  1962. /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
  1963. ///    memory location.
  1964. ///
  1965. /// \headerfile <x86intrin.h>
  1966. ///
  1967. /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
  1968. ///
  1969. /// \param __dp
  1970. ///    A pointer to a 64-bit memory location.
  1971. /// \param __a
  1972. ///    A 128-bit vector of [2 x double] containing the value to be stored.
  1973. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
  1974.                                                         __m128d __a) {
  1975.   struct __mm_storeh_pd_struct {
  1976.     double __u;
  1977.   } __attribute__((__packed__, __may_alias__));
  1978.   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
  1979. }
  1980.  
  1981. /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
  1982. ///    memory location.
  1983. ///
  1984. /// \headerfile <x86intrin.h>
  1985. ///
  1986. /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
  1987. ///
  1988. /// \param __dp
  1989. ///    A pointer to a 64-bit memory location.
  1990. /// \param __a
  1991. ///    A 128-bit vector of [2 x double] containing the value to be stored.
  1992. static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
  1993.                                                         __m128d __a) {
  1994.   struct __mm_storeh_pd_struct {
  1995.     double __u;
  1996.   } __attribute__((__packed__, __may_alias__));
  1997.   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
  1998. }
  1999.  
  2000. /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
  2001. ///    saving the lower 8 bits of each sum in the corresponding element of a
  2002. ///    128-bit result vector of [16 x i8].
  2003. ///
  2004. ///    The integer elements of both parameters can be either signed or unsigned.
  2005. ///
  2006. /// \headerfile <x86intrin.h>
  2007. ///
  2008. /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
  2009. ///
  2010. /// \param __a
  2011. ///    A 128-bit vector of [16 x i8].
  2012. /// \param __b
  2013. ///    A 128-bit vector of [16 x i8].
  2014. /// \returns A 128-bit vector of [16 x i8] containing the sums of both
  2015. ///    parameters.
  2016. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
  2017.                                                           __m128i __b) {
  2018.   return (__m128i)((__v16qu)__a + (__v16qu)__b);
  2019. }
  2020.  
  2021. /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
  2022. ///    saving the lower 16 bits of each sum in the corresponding element of a
  2023. ///    128-bit result vector of [8 x i16].
  2024. ///
  2025. ///    The integer elements of both parameters can be either signed or unsigned.
  2026. ///
  2027. /// \headerfile <x86intrin.h>
  2028. ///
  2029. /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
  2030. ///
  2031. /// \param __a
  2032. ///    A 128-bit vector of [8 x i16].
  2033. /// \param __b
  2034. ///    A 128-bit vector of [8 x i16].
  2035. /// \returns A 128-bit vector of [8 x i16] containing the sums of both
  2036. ///    parameters.
  2037. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
  2038.                                                            __m128i __b) {
  2039.   return (__m128i)((__v8hu)__a + (__v8hu)__b);
  2040. }
  2041.  
  2042. /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
  2043. ///    saving the lower 32 bits of each sum in the corresponding element of a
  2044. ///    128-bit result vector of [4 x i32].
  2045. ///
  2046. ///    The integer elements of both parameters can be either signed or unsigned.
  2047. ///
  2048. /// \headerfile <x86intrin.h>
  2049. ///
  2050. /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
  2051. ///
  2052. /// \param __a
  2053. ///    A 128-bit vector of [4 x i32].
  2054. /// \param __b
  2055. ///    A 128-bit vector of [4 x i32].
  2056. /// \returns A 128-bit vector of [4 x i32] containing the sums of both
  2057. ///    parameters.
  2058. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
  2059.                                                            __m128i __b) {
  2060.   return (__m128i)((__v4su)__a + (__v4su)__b);
  2061. }
  2062.  
  2063. /// Adds two signed or unsigned 64-bit integer values, returning the
  2064. ///    lower 64 bits of the sum.
  2065. ///
  2066. /// \headerfile <x86intrin.h>
  2067. ///
  2068. /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
  2069. ///
  2070. /// \param __a
  2071. ///    A 64-bit integer.
  2072. /// \param __b
  2073. ///    A 64-bit integer.
  2074. /// \returns A 64-bit integer containing the sum of both parameters.
  2075. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
  2076.                                                             __m64 __b) {
  2077.   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
  2078. }
  2079.  
  2080. /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
  2081. ///    saving the lower 64 bits of each sum in the corresponding element of a
  2082. ///    128-bit result vector of [2 x i64].
  2083. ///
  2084. ///    The integer elements of both parameters can be either signed or unsigned.
  2085. ///
  2086. /// \headerfile <x86intrin.h>
  2087. ///
  2088. /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
  2089. ///
  2090. /// \param __a
  2091. ///    A 128-bit vector of [2 x i64].
  2092. /// \param __b
  2093. ///    A 128-bit vector of [2 x i64].
  2094. /// \returns A 128-bit vector of [2 x i64] containing the sums of both
  2095. ///    parameters.
  2096. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
  2097.                                                            __m128i __b) {
  2098.   return (__m128i)((__v2du)__a + (__v2du)__b);
  2099. }
  2100.  
  2101. /// Adds, with saturation, the corresponding elements of two 128-bit
  2102. ///    signed [16 x i8] vectors, saving each sum in the corresponding element of
  2103. ///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
  2104. ///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
  2105. ///
  2106. /// \headerfile <x86intrin.h>
  2107. ///
  2108. /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
  2109. ///
  2110. /// \param __a
  2111. ///    A 128-bit signed [16 x i8] vector.
  2112. /// \param __b
  2113. ///    A 128-bit signed [16 x i8] vector.
  2114. /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
  2115. ///    both parameters.
  2116. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
  2117.                                                            __m128i __b) {
  2118.   return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
  2119. }
  2120.  
  2121. /// Adds, with saturation, the corresponding elements of two 128-bit
  2122. ///    signed [8 x i16] vectors, saving each sum in the corresponding element of
  2123. ///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
  2124. ///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
  2125. ///    0x8000.
  2126. ///
  2127. /// \headerfile <x86intrin.h>
  2128. ///
  2129. /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
  2130. ///
  2131. /// \param __a
  2132. ///    A 128-bit signed [8 x i16] vector.
  2133. /// \param __b
  2134. ///    A 128-bit signed [8 x i16] vector.
  2135. /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
  2136. ///    both parameters.
  2137. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
  2138.                                                             __m128i __b) {
  2139.   return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
  2140. }
  2141.  
  2142. /// Adds, with saturation, the corresponding elements of two 128-bit
  2143. ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
  2144. ///    of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
  2145. ///    are saturated to 0xFF. Negative sums are saturated to 0x00.
  2146. ///
  2147. /// \headerfile <x86intrin.h>
  2148. ///
  2149. /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
  2150. ///
  2151. /// \param __a
  2152. ///    A 128-bit unsigned [16 x i8] vector.
  2153. /// \param __b
  2154. ///    A 128-bit unsigned [16 x i8] vector.
  2155. /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
  2156. ///    of both parameters.
  2157. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
  2158.                                                            __m128i __b) {
  2159.   return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
  2160. }
  2161.  
  2162. /// Adds, with saturation, the corresponding elements of two 128-bit
  2163. ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
  2164. ///    of a 128-bit result vector of [8 x i16]. Positive sums greater than
  2165. ///    0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
  2166. ///
  2167. /// \headerfile <x86intrin.h>
  2168. ///
  2169. /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
  2170. ///
  2171. /// \param __a
  2172. ///    A 128-bit unsigned [8 x i16] vector.
  2173. /// \param __b
  2174. ///    A 128-bit unsigned [8 x i16] vector.
  2175. /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
  2176. ///    of both parameters.
  2177. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
  2178.                                                             __m128i __b) {
  2179.   return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
  2180. }
  2181.  
  2182. /// Computes the rounded averages of corresponding elements of two
  2183. ///    128-bit unsigned [16 x i8] vectors, saving each result in the
  2184. ///    corresponding element of a 128-bit result vector of [16 x i8].
  2185. ///
  2186. /// \headerfile <x86intrin.h>
  2187. ///
  2188. /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
  2189. ///
  2190. /// \param __a
  2191. ///    A 128-bit unsigned [16 x i8] vector.
  2192. /// \param __b
  2193. ///    A 128-bit unsigned [16 x i8] vector.
  2194. /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
  2195. ///    averages of both parameters.
  2196. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
  2197.                                                           __m128i __b) {
  2198.   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
  2199. }
  2200.  
  2201. /// Computes the rounded averages of corresponding elements of two
  2202. ///    128-bit unsigned [8 x i16] vectors, saving each result in the
  2203. ///    corresponding element of a 128-bit result vector of [8 x i16].
  2204. ///
  2205. /// \headerfile <x86intrin.h>
  2206. ///
  2207. /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
  2208. ///
  2209. /// \param __a
  2210. ///    A 128-bit unsigned [8 x i16] vector.
  2211. /// \param __b
  2212. ///    A 128-bit unsigned [8 x i16] vector.
  2213. /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
  2214. ///    averages of both parameters.
  2215. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
  2216.                                                            __m128i __b) {
  2217.   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
  2218. }
  2219.  
  2220. /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
  2221. ///    vectors, producing eight intermediate 32-bit signed integer products, and
  2222. ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
  2223. ///    [4 x i32] vector.
  2224. ///
  2225. ///    For example, bits [15:0] of both parameters are multiplied producing a
  2226. ///    32-bit product, bits [31:16] of both parameters are multiplied producing
  2227. ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
  2228. ///    of the result.
  2229. ///
  2230. /// \headerfile <x86intrin.h>
  2231. ///
  2232. /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
  2233. ///
  2234. /// \param __a
  2235. ///    A 128-bit signed [8 x i16] vector.
  2236. /// \param __b
  2237. ///    A 128-bit signed [8 x i16] vector.
  2238. /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
  2239. ///    of both parameters.
  2240. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
  2241.                                                             __m128i __b) {
  2242.   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
  2243. }
  2244.  
  2245. /// Compares corresponding elements of two 128-bit signed [8 x i16]
  2246. ///    vectors, saving the greater value from each comparison in the
  2247. ///    corresponding element of a 128-bit result vector of [8 x i16].
  2248. ///
  2249. /// \headerfile <x86intrin.h>
  2250. ///
  2251. /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
  2252. ///
  2253. /// \param __a
  2254. ///    A 128-bit signed [8 x i16] vector.
  2255. /// \param __b
  2256. ///    A 128-bit signed [8 x i16] vector.
  2257. /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
  2258. ///    each comparison.
  2259. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
  2260.                                                            __m128i __b) {
  2261.   return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
  2262. }
  2263.  
  2264. /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
  2265. ///    vectors, saving the greater value from each comparison in the
  2266. ///    corresponding element of a 128-bit result vector of [16 x i8].
  2267. ///
  2268. /// \headerfile <x86intrin.h>
  2269. ///
  2270. /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
  2271. ///
  2272. /// \param __a
  2273. ///    A 128-bit unsigned [16 x i8] vector.
  2274. /// \param __b
  2275. ///    A 128-bit unsigned [16 x i8] vector.
  2276. /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
  2277. ///    each comparison.
  2278. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
  2279.                                                           __m128i __b) {
  2280.   return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
  2281. }
  2282.  
  2283. /// Compares corresponding elements of two 128-bit signed [8 x i16]
  2284. ///    vectors, saving the smaller value from each comparison in the
  2285. ///    corresponding element of a 128-bit result vector of [8 x i16].
  2286. ///
  2287. /// \headerfile <x86intrin.h>
  2288. ///
  2289. /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
  2290. ///
  2291. /// \param __a
  2292. ///    A 128-bit signed [8 x i16] vector.
  2293. /// \param __b
  2294. ///    A 128-bit signed [8 x i16] vector.
  2295. /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
  2296. ///    each comparison.
  2297. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
  2298.                                                            __m128i __b) {
  2299.   return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
  2300. }
  2301.  
  2302. /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
  2303. ///    vectors, saving the smaller value from each comparison in the
  2304. ///    corresponding element of a 128-bit result vector of [16 x i8].
  2305. ///
  2306. /// \headerfile <x86intrin.h>
  2307. ///
  2308. /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
  2309. ///
  2310. /// \param __a
  2311. ///    A 128-bit unsigned [16 x i8] vector.
  2312. /// \param __b
  2313. ///    A 128-bit unsigned [16 x i8] vector.
  2314. /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
  2315. ///    each comparison.
  2316. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
  2317.                                                           __m128i __b) {
  2318.   return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
  2319. }
  2320.  
  2321. /// Multiplies the corresponding elements of two signed [8 x i16]
  2322. ///    vectors, saving the upper 16 bits of each 32-bit product in the
  2323. ///    corresponding element of a 128-bit signed [8 x i16] result vector.
  2324. ///
  2325. /// \headerfile <x86intrin.h>
  2326. ///
  2327. /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
  2328. ///
  2329. /// \param __a
  2330. ///    A 128-bit signed [8 x i16] vector.
  2331. /// \param __b
  2332. ///    A 128-bit signed [8 x i16] vector.
  2333. /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
  2334. ///    each of the eight 32-bit products.
  2335. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
  2336.                                                              __m128i __b) {
  2337.   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
  2338. }
  2339.  
  2340. /// Multiplies the corresponding elements of two unsigned [8 x i16]
  2341. ///    vectors, saving the upper 16 bits of each 32-bit product in the
  2342. ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
  2343. ///
  2344. /// \headerfile <x86intrin.h>
  2345. ///
  2346. /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
  2347. ///
  2348. /// \param __a
  2349. ///    A 128-bit unsigned [8 x i16] vector.
  2350. /// \param __b
  2351. ///    A 128-bit unsigned [8 x i16] vector.
  2352. /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
  2353. ///    of each of the eight 32-bit products.
  2354. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
  2355.                                                              __m128i __b) {
  2356.   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
  2357. }
  2358.  
  2359. /// Multiplies the corresponding elements of two signed [8 x i16]
  2360. ///    vectors, saving the lower 16 bits of each 32-bit product in the
  2361. ///    corresponding element of a 128-bit signed [8 x i16] result vector.
  2362. ///
  2363. /// \headerfile <x86intrin.h>
  2364. ///
  2365. /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
  2366. ///
  2367. /// \param __a
  2368. ///    A 128-bit signed [8 x i16] vector.
  2369. /// \param __b
  2370. ///    A 128-bit signed [8 x i16] vector.
  2371. /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
  2372. ///    each of the eight 32-bit products.
  2373. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
  2374.                                                              __m128i __b) {
  2375.   return (__m128i)((__v8hu)__a * (__v8hu)__b);
  2376. }
  2377.  
  2378. /// Multiplies 32-bit unsigned integer values contained in the lower bits
  2379. ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
  2380. ///    product.
  2381. ///
  2382. /// \headerfile <x86intrin.h>
  2383. ///
  2384. /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
  2385. ///
  2386. /// \param __a
  2387. ///    A 64-bit integer containing one of the source operands.
  2388. /// \param __b
  2389. ///    A 64-bit integer containing one of the source operands.
  2390. /// \returns A 64-bit integer vector containing the product of both operands.
  2391. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
  2392.                                                             __m64 __b) {
  2393.   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
  2394. }
  2395.  
  2396. /// Multiplies 32-bit unsigned integer values contained in the lower
  2397. ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
  2398. ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
  2399. ///
  2400. /// \headerfile <x86intrin.h>
  2401. ///
  2402. /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
  2403. ///
  2404. /// \param __a
  2405. ///    A [2 x i64] vector containing one of the source operands.
  2406. /// \param __b
  2407. ///    A [2 x i64] vector containing one of the source operands.
  2408. /// \returns A [2 x i64] vector containing the product of both operands.
  2409. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
  2410.                                                            __m128i __b) {
  2411.   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
  2412. }
  2413.  
  2414. /// Computes the absolute differences of corresponding 8-bit integer
  2415. ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
  2416. ///    separately sums the second 8 absolute differences. Packs these two
  2417. ///    unsigned 16-bit integer sums into the upper and lower elements of a
  2418. ///    [2 x i64] vector.
  2419. ///
  2420. /// \headerfile <x86intrin.h>
  2421. ///
  2422. /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
  2423. ///
  2424. /// \param __a
  2425. ///    A 128-bit integer vector containing one of the source operands.
  2426. /// \param __b
  2427. ///    A 128-bit integer vector containing one of the source operands.
  2428. /// \returns A [2 x i64] vector containing the sums of the sets of absolute
  2429. ///    differences between both operands.
  2430. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
  2431.                                                           __m128i __b) {
  2432.   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
  2433. }
  2434.  
  2435. /// Subtracts the corresponding 8-bit integer values in the operands.
  2436. ///
  2437. /// \headerfile <x86intrin.h>
  2438. ///
  2439. /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
  2440. ///
  2441. /// \param __a
  2442. ///    A 128-bit integer vector containing the minuends.
  2443. /// \param __b
  2444. ///    A 128-bit integer vector containing the subtrahends.
  2445. /// \returns A 128-bit integer vector containing the differences of the values
  2446. ///    in the operands.
  2447. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
  2448.                                                           __m128i __b) {
  2449.   return (__m128i)((__v16qu)__a - (__v16qu)__b);
  2450. }
  2451.  
  2452. /// Subtracts the corresponding 16-bit integer values in the operands.
  2453. ///
  2454. /// \headerfile <x86intrin.h>
  2455. ///
  2456. /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
  2457. ///
  2458. /// \param __a
  2459. ///    A 128-bit integer vector containing the minuends.
  2460. /// \param __b
  2461. ///    A 128-bit integer vector containing the subtrahends.
  2462. /// \returns A 128-bit integer vector containing the differences of the values
  2463. ///    in the operands.
  2464. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
  2465.                                                            __m128i __b) {
  2466.   return (__m128i)((__v8hu)__a - (__v8hu)__b);
  2467. }
  2468.  
  2469. /// Subtracts the corresponding 32-bit integer values in the operands.
  2470. ///
  2471. /// \headerfile <x86intrin.h>
  2472. ///
  2473. /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
  2474. ///
  2475. /// \param __a
  2476. ///    A 128-bit integer vector containing the minuends.
  2477. /// \param __b
  2478. ///    A 128-bit integer vector containing the subtrahends.
  2479. /// \returns A 128-bit integer vector containing the differences of the values
  2480. ///    in the operands.
  2481. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
  2482.                                                            __m128i __b) {
  2483.   return (__m128i)((__v4su)__a - (__v4su)__b);
  2484. }
  2485.  
  2486. /// Subtracts signed or unsigned 64-bit integer values and writes the
  2487. ///    difference to the corresponding bits in the destination.
  2488. ///
  2489. /// \headerfile <x86intrin.h>
  2490. ///
  2491. /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
  2492. ///
  2493. /// \param __a
  2494. ///    A 64-bit integer vector containing the minuend.
  2495. /// \param __b
  2496. ///    A 64-bit integer vector containing the subtrahend.
  2497. /// \returns A 64-bit integer vector containing the difference of the values in
  2498. ///    the operands.
  2499. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
  2500.                                                             __m64 __b) {
  2501.   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
  2502. }
  2503.  
  2504. /// Subtracts the corresponding elements of two [2 x i64] vectors.
  2505. ///
  2506. /// \headerfile <x86intrin.h>
  2507. ///
  2508. /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
  2509. ///
  2510. /// \param __a
  2511. ///    A 128-bit integer vector containing the minuends.
  2512. /// \param __b
  2513. ///    A 128-bit integer vector containing the subtrahends.
  2514. /// \returns A 128-bit integer vector containing the differences of the values
  2515. ///    in the operands.
  2516. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
  2517.                                                            __m128i __b) {
  2518.   return (__m128i)((__v2du)__a - (__v2du)__b);
  2519. }
  2520.  
  2521. /// Subtracts corresponding 8-bit signed integer values in the input and
  2522. ///    returns the differences in the corresponding bytes in the destination.
  2523. ///    Differences greater than 0x7F are saturated to 0x7F, and differences less
  2524. ///    than 0x80 are saturated to 0x80.
  2525. ///
  2526. /// \headerfile <x86intrin.h>
  2527. ///
  2528. /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
  2529. ///
  2530. /// \param __a
  2531. ///    A 128-bit integer vector containing the minuends.
  2532. /// \param __b
  2533. ///    A 128-bit integer vector containing the subtrahends.
  2534. /// \returns A 128-bit integer vector containing the differences of the values
  2535. ///    in the operands.
  2536. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
  2537.                                                            __m128i __b) {
  2538.   return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
  2539. }
  2540.  
  2541. /// Subtracts corresponding 16-bit signed integer values in the input and
  2542. ///    returns the differences in the corresponding bytes in the destination.
  2543. ///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
  2544. ///    than 0x8000 are saturated to 0x8000.
  2545. ///
  2546. /// \headerfile <x86intrin.h>
  2547. ///
  2548. /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
  2549. ///
  2550. /// \param __a
  2551. ///    A 128-bit integer vector containing the minuends.
  2552. /// \param __b
  2553. ///    A 128-bit integer vector containing the subtrahends.
  2554. /// \returns A 128-bit integer vector containing the differences of the values
  2555. ///    in the operands.
  2556. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
  2557.                                                             __m128i __b) {
  2558.   return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
  2559. }
  2560.  
  2561. /// Performs a saturating subtraction on each pair of unsigned 8-bit values
  2562. ///    from the two operands. A difference below 0x00 is clamped to 0x00.
  2563. ///
  2564. /// \headerfile <x86intrin.h>
  2565. ///
  2566. /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
  2567. ///
  2568. /// \param __a
  2569. ///    A 128-bit integer vector containing the minuends.
  2570. /// \param __b
  2571. ///    A 128-bit integer vector containing the subtrahends.
  2572. /// \returns A 128-bit integer vector containing the unsigned differences.
  2573. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2574. _mm_subs_epu8(__m128i __a, __m128i __b) {
  2575.   __v16qu __minuends = (__v16qu)__a;
  2576.   __v16qu __subtrahends = (__v16qu)__b;
  2577.   return (__m128i)__builtin_elementwise_sub_sat(__minuends, __subtrahends);
  2578. }
  2579.  
  2580. /// Performs a saturating subtraction on each pair of unsigned 16-bit values
  2581. ///    from the two operands. A difference below 0x0000 is clamped to 0x0000.
  2582. ///
  2583. /// \headerfile <x86intrin.h>
  2584. ///
  2585. /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
  2586. ///
  2587. /// \param __a
  2588. ///    A 128-bit integer vector containing the minuends.
  2589. /// \param __b
  2590. ///    A 128-bit integer vector containing the subtrahends.
  2591. /// \returns A 128-bit integer vector containing the unsigned differences.
  2592. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2593. _mm_subs_epu16(__m128i __a, __m128i __b) {
  2594.   __v8hu __minuends = (__v8hu)__a;
  2595.   __v8hu __subtrahends = (__v8hu)__b;
  2596.   return (__m128i)__builtin_elementwise_sub_sat(__minuends, __subtrahends);
  2597. }
  2598.  
  2599. /// Computes the bitwise AND of two 128-bit integer vectors.
  2600. ///
  2601. /// \headerfile <x86intrin.h>
  2602. ///
  2603. /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
  2604. ///
  2605. /// \param __a
  2606. ///    A 128-bit integer vector containing one of the source operands.
  2607. /// \param __b
  2608. ///    A 128-bit integer vector containing one of the source operands.
  2609. /// \returns A 128-bit integer vector containing the AND of both operands.
  2610. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2611. _mm_and_si128(__m128i __a, __m128i __b) {
  2612.   __v2du __bits = (__v2du)__a & (__v2du)__b;
  2613.   return (__m128i)__bits;
  2614. }
  2615.  
  2616. /// Computes the bitwise AND of the one's complement of the first 128-bit
  2617. ///    integer vector with the second 128-bit integer vector.
  2618. ///
  2619. /// \headerfile <x86intrin.h>
  2620. ///
  2621. /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
  2622. ///
  2623. /// \param __a
  2624. ///    A 128-bit vector whose one's complement is the left AND operand.
  2625. /// \param __b
  2626. ///    A 128-bit vector containing the right source operand.
  2627. /// \returns A 128-bit integer vector containing the value of ~__a & __b.
  2628. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2629. _mm_andnot_si128(__m128i __a, __m128i __b) {
  2630.   __v2du __inverted = ~(__v2du)__a;
  2631.   __v2du __bits = __inverted & (__v2du)__b;
  2632.   return (__m128i)__bits;
  2633. }
  2634. /// Computes the bitwise OR of two 128-bit integer vectors.
  2635. ///
  2636. /// \headerfile <x86intrin.h>
  2637. ///
  2638. /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
  2639. ///
  2640. /// \param __a
  2641. ///    A 128-bit integer vector containing one of the source operands.
  2642. /// \param __b
  2643. ///    A 128-bit integer vector containing one of the source operands.
  2644. /// \returns A 128-bit integer vector containing the OR of both operands.
  2645. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2646. _mm_or_si128(__m128i __a, __m128i __b) {
  2647.   __v2du __bits = (__v2du)__a | (__v2du)__b;
  2648.   return (__m128i)__bits;
  2649. }
  2650.  
  2651. /// Computes the bitwise exclusive OR of two 128-bit integer vectors.
  2652. ///
  2653. /// \headerfile <x86intrin.h>
  2654. ///
  2655. /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
  2656. ///
  2657. /// \param __a
  2658. ///    A 128-bit integer vector containing one of the source operands.
  2659. /// \param __b
  2660. ///    A 128-bit integer vector containing one of the source operands.
  2661. /// \returns A 128-bit integer vector containing the XOR of both operands.
  2662. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2663. _mm_xor_si128(__m128i __a, __m128i __b) {
  2664.   __v2du __bits = (__v2du)__a ^ (__v2du)__b;
  2665.   return (__m128i)__bits;
  2666. }
  2667.  
  2668. /// Left-shifts the 128-bit integer vector operand by the specified
  2669. ///    number of bytes. Low-order bits are cleared. The _mm_bslli_si128 macro
  2670. ///    that follows this one has the identical expansion.
  2671. ///
  2672. /// \headerfile <x86intrin.h>
  2673. ///
  2674. /// \code
  2675. /// __m128i _mm_slli_si128(__m128i a, const int imm);
  2676. /// \endcode
  2677. ///
  2678. /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
  2679. ///
  2680. /// \param a
  2681. ///    A 128-bit integer vector containing the source operand.
  2682. /// \param imm
  2683. ///    An immediate value giving the number of bytes to left-shift \a a.
  2684. /// \returns A 128-bit integer vector containing the left-shifted value.
  2685. #define _mm_slli_si128(a, imm)                                                 \
  2686.   ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
  2687.                                                 (int)(imm)))
  2688.  
  2689. #define _mm_bslli_si128(a, imm)                                                \
  2690.   ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
  2691.                                                 (int)(imm)))
  2692.  
  2693. /// Shifts each 16-bit element of the 128-bit integer vector operand left by
  2694. ///    the given number of bits. Low-order bits are cleared.
  2695. ///
  2696. /// \headerfile <x86intrin.h>
  2697. ///
  2698. /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
  2699. ///
  2700. /// \param __a
  2701. ///    A 128-bit integer vector containing the source operand.
  2702. /// \param __count
  2703. ///    An integer giving the number of bits to shift each value of \a __a.
  2704. /// \returns A 128-bit integer vector containing the left-shifted values.
  2705. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2706. _mm_slli_epi16(__m128i __a, int __count) {
  2707.   __v8hi __lanes = (__v8hi)__a;
  2708.   return (__m128i)__builtin_ia32_psllwi128(__lanes, __count);
  2709. }
  2710.  
  2711. /// Shifts each 16-bit element of the 128-bit integer vector operand left by
  2712. ///    the given number of bits. Low-order bits are cleared.
  2713. ///
  2714. /// \headerfile <x86intrin.h>
  2715. ///
  2716. /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
  2717. ///
  2718. /// \param __a
  2719. ///    A 128-bit integer vector containing the source operand.
  2720. /// \param __count
  2721. ///    A 128-bit integer vector whose bits [63:0] hold the shift count.
  2722. /// \returns A 128-bit integer vector containing the left-shifted values.
  2723. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2724. _mm_sll_epi16(__m128i __a, __m128i __count) {
  2725.   __v8hi __amount = (__v8hi)__count;
  2726.   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, __amount);
  2727. }
  2728.  
  2729. /// Shifts each 32-bit element of the 128-bit integer vector operand left by
  2730. ///    the given number of bits. Low-order bits are cleared.
  2731. ///
  2732. /// \headerfile <x86intrin.h>
  2733. ///
  2734. /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
  2735. ///
  2736. /// \param __a
  2737. ///    A 128-bit integer vector containing the source operand.
  2738. /// \param __count
  2739. ///    An integer giving the number of bits to shift each value of \a __a.
  2740. /// \returns A 128-bit integer vector containing the left-shifted values.
  2741. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2742. _mm_slli_epi32(__m128i __a, int __count) {
  2743.   __v4si __lanes = (__v4si)__a;
  2744.   return (__m128i)__builtin_ia32_pslldi128(__lanes, __count);
  2745. }
  2746.  
  2747. /// Shifts each 32-bit element of the 128-bit integer vector operand left by
  2748. ///    the given number of bits. Low-order bits are cleared.
  2749. ///
  2750. /// \headerfile <x86intrin.h>
  2751. ///
  2752. /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
  2753. ///
  2754. /// \param __a
  2755. ///    A 128-bit integer vector containing the source operand.
  2756. /// \param __count
  2757. ///    A 128-bit integer vector whose bits [63:0] hold the shift count.
  2758. /// \returns A 128-bit integer vector containing the left-shifted values.
  2759. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2760. _mm_sll_epi32(__m128i __a, __m128i __count) {
  2761.   __v4si __amount = (__v4si)__count;
  2762.   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, __amount);
  2763. }
  2764.  
  2765. /// Shifts each 64-bit element of the 128-bit integer vector operand left by
  2766. ///    the given number of bits. Low-order bits are cleared.
  2767. ///
  2768. /// \headerfile <x86intrin.h>
  2769. ///
  2770. /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
  2771. ///
  2772. /// \param __a
  2773. ///    A 128-bit integer vector containing the source operand.
  2774. /// \param __count
  2775. ///    An integer giving the number of bits to shift each value of \a __a.
  2776. /// \returns A 128-bit integer vector containing the left-shifted values.
  2777. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2778. _mm_slli_epi64(__m128i __a, int __count) {
  2779.   __v2di __lanes = (__v2di)__a;
  2780.   return (__m128i)__builtin_ia32_psllqi128(__lanes, __count);
  2781. }
  2782.  
  2783. /// Shifts each 64-bit element of the 128-bit integer vector operand left by
  2784. ///    the given number of bits. Low-order bits are cleared.
  2785. ///
  2786. /// \headerfile <x86intrin.h>
  2787. ///
  2788. /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
  2789. ///
  2790. /// \param __a
  2791. ///    A 128-bit integer vector containing the source operand.
  2792. /// \param __count
  2793. ///    A 128-bit integer vector whose bits [63:0] hold the shift count.
  2794. /// \returns A 128-bit integer vector containing the left-shifted values.
  2795. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2796. _mm_sll_epi64(__m128i __a, __m128i __count) {
  2797.   __v2di __amount = (__v2di)__count;
  2798.   return (__m128i)__builtin_ia32_psllq128((__v2di)__a, __amount);
  2799. }
  2800.  
  2801. /// Arithmetically right-shifts each 16-bit element of the 128-bit integer
  2802. ///    vector operand by the given number of bits. High-order bits are filled
  2803. ///    with the sign bit of the initial value.
  2804. ///
  2805. /// \headerfile <x86intrin.h>
  2806. ///
  2807. /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
  2808. ///
  2809. /// \param __a
  2810. ///    A 128-bit integer vector containing the source operand.
  2811. /// \param __count
  2812. ///    An integer giving the number of bits to shift each value of \a __a.
  2813. /// \returns A 128-bit integer vector containing the right-shifted values.
  2814. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2815. _mm_srai_epi16(__m128i __a, int __count) {
  2816.   __v8hi __lanes = (__v8hi)__a;
  2817.   return (__m128i)__builtin_ia32_psrawi128(__lanes, __count);
  2818. }
  2819.  
  2820. /// Arithmetically right-shifts each 16-bit element of the 128-bit integer
  2821. ///    vector operand by the given number of bits. High-order bits are filled
  2822. ///    with the sign bit of the initial value.
  2823. ///
  2824. /// \headerfile <x86intrin.h>
  2825. ///
  2826. /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
  2827. ///
  2828. /// \param __a
  2829. ///    A 128-bit integer vector containing the source operand.
  2830. /// \param __count
  2831. ///    A 128-bit integer vector whose bits [63:0] hold the shift count.
  2832. /// \returns A 128-bit integer vector containing the right-shifted values.
  2833. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2834. _mm_sra_epi16(__m128i __a, __m128i __count) {
  2835.   __v8hi __amount = (__v8hi)__count;
  2836.   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, __amount);
  2837. }
  2838.  
  2839. /// Arithmetically right-shifts each 32-bit element of the 128-bit integer
  2840. ///    vector operand by the given number of bits. High-order bits are filled
  2841. ///    with the sign bit of the initial value.
  2842. ///
  2843. /// \headerfile <x86intrin.h>
  2844. ///
  2845. /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
  2846. ///
  2847. /// \param __a
  2848. ///    A 128-bit integer vector containing the source operand.
  2849. /// \param __count
  2850. ///    An integer giving the number of bits to shift each value of \a __a.
  2851. /// \returns A 128-bit integer vector containing the right-shifted values.
  2852. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2853. _mm_srai_epi32(__m128i __a, int __count) {
  2854.   __v4si __lanes = (__v4si)__a;
  2855.   return (__m128i)__builtin_ia32_psradi128(__lanes, __count);
  2856. }
  2857.  
  2858. /// Arithmetically right-shifts each 32-bit element of the 128-bit integer
  2859. ///    vector operand by the given number of bits. High-order bits are filled
  2860. ///    with the sign bit of the initial value.
  2861. ///
  2862. /// \headerfile <x86intrin.h>
  2863. ///
  2864. /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
  2865. ///
  2866. /// \param __a
  2867. ///    A 128-bit integer vector containing the source operand.
  2868. /// \param __count
  2869. ///    A 128-bit integer vector whose bits [63:0] hold the shift count.
  2870. /// \returns A 128-bit integer vector containing the right-shifted values.
  2871. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2872. _mm_sra_epi32(__m128i __a, __m128i __count) {
  2873.   __v4si __amount = (__v4si)__count;
  2874.   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, __amount);
  2875. }
  2876.  
  2877. /// Right-shifts the 128-bit integer vector operand by the specified
  2878. ///    number of bytes. High-order bits are cleared. The _mm_bsrli_si128 macro
  2879. ///    that follows this one has the identical expansion.
  2880. ///
  2881. /// \headerfile <x86intrin.h>
  2882. ///
  2883. /// \code
  2884. /// __m128i _mm_srli_si128(__m128i a, const int imm);
  2885. /// \endcode
  2886. ///
  2887. /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
  2888. ///
  2889. /// \param a
  2890. ///    A 128-bit integer vector containing the source operand.
  2891. /// \param imm
  2892. ///    An immediate value giving the number of bytes to right-shift \a a.
  2893. /// \returns A 128-bit integer vector containing the right-shifted value.
  2894. #define _mm_srli_si128(a, imm)                                                 \
  2895.   ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
  2896.                                                 (int)(imm)))
  2897.  
  2898. #define _mm_bsrli_si128(a, imm)                                                \
  2899.   ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
  2900.                                                 (int)(imm)))
  2901.  
  2902. /// Shifts each 16-bit element of the 128-bit integer vector operand right by
  2903. ///    the given number of bits. High-order bits are cleared.
  2904. ///
  2905. /// \headerfile <x86intrin.h>
  2906. ///
  2907. /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
  2908. ///
  2909. /// \param __a
  2910. ///    A 128-bit integer vector containing the source operand.
  2911. /// \param __count
  2912. ///    An integer giving the number of bits to shift each value of \a __a.
  2913. /// \returns A 128-bit integer vector containing the right-shifted values.
  2914. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2915. _mm_srli_epi16(__m128i __a, int __count) {
  2916.   __v8hi __lanes = (__v8hi)__a;
  2917.   return (__m128i)__builtin_ia32_psrlwi128(__lanes, __count);
  2918. }
  2919.  
  2920. /// Shifts each 16-bit element of the 128-bit integer vector operand right by
  2921. ///    the given number of bits. High-order bits are cleared.
  2922. ///
  2923. /// \headerfile <x86intrin.h>
  2924. ///
  2925. /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
  2926. ///
  2927. /// \param __a
  2928. ///    A 128-bit integer vector containing the source operand.
  2929. /// \param __count
  2930. ///    A 128-bit integer vector whose bits [63:0] hold the shift count.
  2931. /// \returns A 128-bit integer vector containing the right-shifted values.
  2932. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2933. _mm_srl_epi16(__m128i __a, __m128i __count) {
  2934.   __v8hi __amount = (__v8hi)__count;
  2935.   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, __amount);
  2936. }
  2937.  
  2938. /// Shifts each 32-bit element of the 128-bit integer vector operand right by
  2939. ///    the given number of bits. High-order bits are cleared.
  2940. ///
  2941. /// \headerfile <x86intrin.h>
  2942. ///
  2943. /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
  2944. ///
  2945. /// \param __a
  2946. ///    A 128-bit integer vector containing the source operand.
  2947. /// \param __count
  2948. ///    An integer giving the number of bits to shift each value of \a __a.
  2949. /// \returns A 128-bit integer vector containing the right-shifted values.
  2950. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2951. _mm_srli_epi32(__m128i __a, int __count) {
  2952.   __v4si __lanes = (__v4si)__a;
  2953.   return (__m128i)__builtin_ia32_psrldi128(__lanes, __count);
  2954. }
  2955.  
  2956. /// Shifts each 32-bit element of the 128-bit integer vector operand right by
  2957. ///    the given number of bits. High-order bits are cleared.
  2958. ///
  2959. /// \headerfile <x86intrin.h>
  2960. ///
  2961. /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
  2962. ///
  2963. /// \param __a
  2964. ///    A 128-bit integer vector containing the source operand.
  2965. /// \param __count
  2966. ///    A 128-bit integer vector whose bits [63:0] hold the shift count.
  2967. /// \returns A 128-bit integer vector containing the right-shifted values.
  2968. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2969. _mm_srl_epi32(__m128i __a, __m128i __count) {
  2970.   __v4si __amount = (__v4si)__count;
  2971.   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, __amount);
  2972. }
  2973.  
  2974. /// Shifts each 64-bit element of the 128-bit integer vector operand right by
  2975. ///    the given number of bits. High-order bits are cleared.
  2976. ///
  2977. /// \headerfile <x86intrin.h>
  2978. ///
  2979. /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
  2980. ///
  2981. /// \param __a
  2982. ///    A 128-bit integer vector containing the source operand.
  2983. /// \param __count
  2984. ///    An integer giving the number of bits to shift each value of \a __a.
  2985. /// \returns A 128-bit integer vector containing the right-shifted values.
  2986. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2987. _mm_srli_epi64(__m128i __a, int __count) {
  2988.   __v2di __lanes = (__v2di)__a;
  2989.   return (__m128i)__builtin_ia32_psrlqi128(__lanes, __count);
  2990. }
  2991.  
  2992. /// Shifts each 64-bit element of the 128-bit integer vector operand right by
  2993. ///    the given number of bits. High-order bits are cleared.
  2994. ///
  2995. /// \headerfile <x86intrin.h>
  2996. ///
  2997. /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
  2998. ///
  2999. /// \param __a
  3000. ///    A 128-bit integer vector containing the source operand.
  3001. /// \param __count
  3002. ///    A 128-bit integer vector whose bits [63:0] hold the shift count.
  3003. /// \returns A 128-bit integer vector containing the right-shifted values.
  3004. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3005. _mm_srl_epi64(__m128i __a, __m128i __count) {
  3006.   __v2di __amount = (__v2di)__count;
  3007.   return (__m128i)__builtin_ia32_psrlq128((__v2di)__a, __amount);
  3008. }
  3009.  
  3010. /// Tests corresponding 8-bit values of two 128-bit vectors for equality.
  3011. ///
  3012. /// \headerfile <x86intrin.h>
  3013. ///
  3014. /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
  3015. ///
  3016. /// \param __a
  3017. ///    A 128-bit integer vector.
  3018. /// \param __b
  3019. ///    A 128-bit integer vector.
  3020. /// \returns A 128-bit integer vector holding 0xFF where equal, else 0x0.
  3021. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3022. _mm_cmpeq_epi8(__m128i __a, __m128i __b) {
  3023.   __v16qi __lhs = (__v16qi)__a;
  3024.   __v16qi __rhs = (__v16qi)__b;
  3025.   return (__m128i)(__lhs == __rhs);
  3026. }
  3027.  
  3028. /// Tests corresponding 16-bit values of two 128-bit vectors for equality.
  3029. ///
  3030. /// \headerfile <x86intrin.h>
  3031. ///
  3032. /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
  3033. ///
  3034. /// \param __a
  3035. ///    A 128-bit integer vector.
  3036. /// \param __b
  3037. ///    A 128-bit integer vector.
  3038. /// \returns A 128-bit integer vector holding 0xFFFF where equal, else 0x0.
  3039. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3040. _mm_cmpeq_epi16(__m128i __a, __m128i __b) {
  3041.   __v8hi __lhs = (__v8hi)__a;
  3042.   __v8hi __rhs = (__v8hi)__b;
  3043.   return (__m128i)(__lhs == __rhs);
  3044. }
  3045.  
  3046. /// Tests corresponding 32-bit values of two 128-bit vectors for equality.
  3047. ///
  3048. /// \headerfile <x86intrin.h>
  3049. ///
  3050. /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
  3051. ///
  3052. /// \param __a
  3053. ///    A 128-bit integer vector.
  3054. /// \param __b
  3055. ///    A 128-bit integer vector.
  3056. /// \returns A 128-bit integer vector holding 0xFFFFFFFF where equal, else 0x0.
  3057. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3058. _mm_cmpeq_epi32(__m128i __a, __m128i __b) {
  3059.   __v4si __lhs = (__v4si)__a;
  3060.   __v4si __rhs = (__v4si)__b;
  3061.   return (__m128i)(__lhs == __rhs);
  3062. }
  3063.  
  3064. /// Tests whether each signed 8-bit value of the first operand is greater
  3065. ///    than the corresponding value of the second operand. Each comparison
  3066. ///    yields 0x0 for false, 0xFF for true.
  3067. ///
  3068. /// \headerfile <x86intrin.h>
  3069. ///
  3070. /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
  3071. ///
  3072. /// \param __a
  3073. ///    A 128-bit integer vector.
  3074. /// \param __b
  3075. ///    A 128-bit integer vector.
  3076. /// \returns A 128-bit integer vector containing the comparison results.
  3077. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3078. _mm_cmpgt_epi8(__m128i __a, __m128i __b) {
  3079.   /* __v16qi is plain char, which may be unsigned; __v16qs forces signed. */
  3080.   __v16qs __lhs = (__v16qs)__a;
  3081.   __v16qs __rhs = (__v16qs)__b;
  3082.   return (__m128i)(__lhs > __rhs);
  3083. }
  3084.  
  3085. /// Tests whether each signed 16-bit value of the first operand is greater
  3086. ///    than the corresponding value of the second operand. Each comparison
  3087. ///    yields 0x0 for false, 0xFFFF for true.
  3088. ///
  3089. /// \headerfile <x86intrin.h>
  3090. ///
  3091. /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
  3092. ///
  3093. /// \param __a
  3094. ///    A 128-bit integer vector.
  3095. /// \param __b
  3096. ///    A 128-bit integer vector.
  3097. /// \returns A 128-bit integer vector containing the comparison results.
  3098. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3099. _mm_cmpgt_epi16(__m128i __a, __m128i __b) {
  3100.   __v8hi __lhs = (__v8hi)__a;
  3101.   __v8hi __rhs = (__v8hi)__b;
  3102.   return (__m128i)(__lhs > __rhs);
  3103. }
  3104.  
  3105. /// Tests whether each signed 32-bit value of the first operand is greater
  3106. ///    than the corresponding value of the second operand. Each comparison
  3107. ///    yields 0x0 for false, 0xFFFFFFFF for true.
  3108. ///
  3109. /// \headerfile <x86intrin.h>
  3110. ///
  3111. /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
  3112. ///
  3113. /// \param __a
  3114. ///    A 128-bit integer vector.
  3115. /// \param __b
  3116. ///    A 128-bit integer vector.
  3117. /// \returns A 128-bit integer vector containing the comparison results.
  3118. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3119. _mm_cmpgt_epi32(__m128i __a, __m128i __b) {
  3120.   __v4si __lhs = (__v4si)__a;
  3121.   __v4si __rhs = (__v4si)__b;
  3122.   return (__m128i)(__lhs > __rhs);
  3123. }
  3124.  
  3125. /// Tests whether each signed 8-bit value of the first operand is less than
  3126. ///    the corresponding value of the second operand. Each comparison yields
  3127. ///    0x0 for false, 0xFF for true.
  3128. ///
  3129. /// \headerfile <x86intrin.h>
  3130. ///
  3131. /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
  3132. ///
  3133. /// \param __a
  3134. ///    A 128-bit integer vector.
  3135. /// \param __b
  3136. ///    A 128-bit integer vector.
  3137. /// \returns A 128-bit integer vector containing the comparison results.
  3138. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3139. _mm_cmplt_epi8(__m128i __a, __m128i __b) {
  3140.   /* a < b is computed as b > a, the comparison PCMPGTB provides; __v16qs
  3141.      keeps the comparison signed even where plain char is unsigned. */
  3142.   return (__m128i)((__v16qs)__b > (__v16qs)__a);
  3143. }
  3144.  
  3145. /// Tests whether each signed 16-bit value of the first operand is less than
  3146. ///    the corresponding value of the second operand. Each comparison yields
  3147. ///    0x0 for false, 0xFFFF for true.
  3148. ///
  3149. /// \headerfile <x86intrin.h>
  3150. ///
  3151. /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
  3152. ///
  3153. /// \param __a
  3154. ///    A 128-bit integer vector.
  3155. /// \param __b
  3156. ///    A 128-bit integer vector.
  3157. /// \returns A 128-bit integer vector containing the comparison results.
  3158. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3159. _mm_cmplt_epi16(__m128i __a, __m128i __b) {
  3160.   /* a < b is computed as b > a, which is the comparison that the PCMPGTW
  3161.      instruction named above provides. */
  3162.   return (__m128i)((__v8hi)__b > (__v8hi)__a);
  3163. }
  3164.  
  3165. /// Tests whether each signed 32-bit value of the first operand is less than
  3166. ///    the corresponding value of the second operand. Each comparison yields
  3167. ///    0x0 for false, 0xFFFFFFFF for true.
  3168. ///
  3169. /// \headerfile <x86intrin.h>
  3170. ///
  3171. /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
  3172. ///
  3173. /// \param __a
  3174. ///    A 128-bit integer vector.
  3175. /// \param __b
  3176. ///    A 128-bit integer vector.
  3177. /// \returns A 128-bit integer vector containing the comparison results.
  3178. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3179. _mm_cmplt_epi32(__m128i __a, __m128i __b) {
  3180.   /* a < b is computed as b > a, which is the comparison that the PCMPGTD
  3181.      instruction named above provides. */
  3182.   return (__m128i)((__v4si)__b > (__v4si)__a);
  3183. }
  3184.  
  3185. #ifdef __x86_64__
  3186. /// Converts a 64-bit signed integer to a double-precision value and stores
  3187. ///    it in the lower element of a [2 x double] vector. The upper element of
  3188. ///    the result is copied from the upper element of the first operand.
  3189. ///
  3190. /// \headerfile <x86intrin.h>
  3191. ///
  3192. /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
  3193. ///
  3194. /// \param __a
  3195. ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
  3196. ///    copied to the upper 64 bits of the destination.
  3197. /// \param __b
  3198. ///    A 64-bit signed integer operand containing the value to be converted.
  3199. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits hold the
  3200. ///    converted value of \a __b and whose upper 64 bits are copied from the
  3201. ///    upper 64 bits of \a __a.
  3202. static __inline__ __m128d __DEFAULT_FN_ATTRS
  3203. _mm_cvtsi64_sd(__m128d __a, long long __b) {
  3204.   __m128d __result = __a;
  3205.   __result[0] = (double)__b;
  3206.   return __result;
  3207. }
  3208.  
  3209. /// Converts the lower element of a [2 x double] vector to a 64-bit signed
  3210. ///    integer, rounding according to the current rounding mode.
  3211. ///
  3212. /// \headerfile <x86intrin.h>
  3213. ///
  3214. /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
  3215. ///
  3216. /// \param __a
  3217. ///    A 128-bit vector of [2 x double]; only the lower 64 bits are converted.
  3218. /// \returns A 64-bit signed integer containing the converted value.
  3219. static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
  3220.   __v2df __source = (__v2df)__a;
  3221.   return __builtin_ia32_cvtsd2si64(__source);
  3222. }
  3223.  
  3224. /// Converts the lower element of a [2 x double] vector to a 64-bit signed
  3225. ///    integer, truncating the result when it is inexact.
  3226. ///
  3227. /// \headerfile <x86intrin.h>
  3228. ///
  3229. /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
  3230. ///   instruction.
  3231. ///
  3232. /// \param __a
  3233. ///    A 128-bit vector of [2 x double]; only the lower 64 bits are converted.
  3234. /// \returns A 64-bit signed integer containing the converted value.
  3235. static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
  3236.   __v2df __source = (__v2df)__a;
  3237.   return __builtin_ia32_cvttsd2si64(__source);
  3238. }
  3239. #endif
  3240.  
  3241. /// Converts a vector of [4 x i32] into a vector of [4 x float].
  3242. ///
  3243. /// \headerfile <x86intrin.h>
  3244. ///
  3245. /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
  3246. ///
  3247. /// \param __a
  3248. ///    A 128-bit integer vector.
  3249. /// \returns A 128-bit vector of [4 x float] containing the converted values.
  3250. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
  3251.   return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
  3252. }
  3253.  
  3254. /// Converts a vector of [4 x float] into a vector of [4 x i32].
  3255. ///
  3256. /// \headerfile <x86intrin.h>
  3257. ///
  3258. /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
  3259. ///
  3260. /// \param __a
  3261. ///    A 128-bit vector of [4 x float].
  3262. /// \returns A 128-bit integer vector of [4 x i32] containing the converted
  3263. ///    values.
  3264. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
  3265.   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
  3266. }
  3267.  
  3268. /// Converts a vector of [4 x float] into a vector of [4 x i32],
  3269. ///    truncating the result when it is inexact.
  3270. ///
  3271. /// \headerfile <x86intrin.h>
  3272. ///
  3273. /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
  3274. ///   instruction.
  3275. ///
  3276. /// \param __a
  3277. ///    A 128-bit vector of [4 x float].
  3278. /// \returns A 128-bit vector of [4 x i32] containing the converted values.
  3279. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
  3280.   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
  3281. }
  3282.  
  3283. /// Returns a vector of [4 x i32] where the lowest element is the input
  3284. ///    operand and the remaining elements are zero.
  3285. ///
  3286. /// \headerfile <x86intrin.h>
  3287. ///
  3288. /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
  3289. ///
  3290. /// \param __a
  3291. ///    A 32-bit signed integer operand.
  3292. /// \returns A 128-bit vector of [4 x i32].
  3293. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
  3294.   return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
  3295. }
  3296.  
  3297. /// Returns a vector of [2 x i64] where the lower element is the input
  3298. ///    operand and the upper element is zero.
  3299. ///
  3300. /// \headerfile <x86intrin.h>
  3301. ///
  3302. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
  3303. /// in 64-bit mode.
  3304. ///
  3305. /// \param __a
  3306. ///    A 64-bit signed integer operand containing the value to be converted.
  3307. /// \returns A 128-bit vector of [2 x i64] containing the converted value.
  3308. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
  3309.   return __extension__(__m128i)(__v2di){__a, 0};
  3310. }
  3311.  
  3312. /// Moves the least significant 32 bits of a vector of [4 x i32] to a
  3313. ///    32-bit signed integer value.
  3314. ///
  3315. /// \headerfile <x86intrin.h>
  3316. ///
  3317. /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
  3318. ///
  3319. /// \param __a
  3320. ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
  3321. ///    destination.
  3322. /// \returns A 32-bit signed integer containing the moved value.
  3323. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
  3324.   __v4si __b = (__v4si)__a;
  3325.   return __b[0];
  3326. }
  3327.  
  3328. /// Moves the least significant 64 bits of a vector of [2 x i64] to a
  3329. ///    64-bit signed integer value.
  3330. ///
  3331. /// \headerfile <x86intrin.h>
  3332. ///
  3333. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  3334. ///
  3335. /// \param __a
  3336. ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
  3337. ///    destination.
  3338. /// \returns A 64-bit signed integer containing the moved value.
  3339. static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
  3340.   return __a[0];
  3341. }
  3342.  
  3343. /// Moves packed integer values from an aligned 128-bit memory location
  3344. ///    to elements in a 128-bit integer vector.
  3345. ///
  3346. /// \headerfile <x86intrin.h>
  3347. ///
  3348. /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
  3349. ///
  3350. /// \param __p
  3351. ///    An aligned pointer to a memory location containing integer values.
  3352. /// \returns A 128-bit integer vector containing the moved values.
  3353. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3354. _mm_load_si128(__m128i const *__p) {
  3355.   return *__p;
  3356. }
  3357.  
  3358. /// Moves packed integer values from an unaligned 128-bit memory location
  3359. ///    to elements in a 128-bit integer vector.
  3360. ///
  3361. /// \headerfile <x86intrin.h>
  3362. ///
  3363. /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
  3364. ///
  3365. /// \param __p
  3366. ///    A pointer to a memory location containing integer values.
  3367. /// \returns A 128-bit integer vector containing the moved values.
  3368. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3369. _mm_loadu_si128(__m128i_u const *__p) {
  3370.   struct __loadu_si128 {
  3371.     __m128i_u __v;
  3372.   } __attribute__((__packed__, __may_alias__));
  3373.   return ((const struct __loadu_si128 *)__p)->__v;
  3374. }
  3375.  
  3376. /// Returns a vector of [2 x i64] where the lower element is taken from
  3377. ///    the lower element of the operand, and the upper element is zero.
  3378. ///
  3379. /// \headerfile <x86intrin.h>
  3380. ///
  3381. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  3382. ///
  3383. /// \param __p
  3384. ///    A pointer to a memory location, which does not have to be aligned. The
  3385. ///    64-bit integer stored there is written to bits [63:0] of the destination.
  3386. /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
  3387. ///    moved value. The higher order bits are cleared.
  3388. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3389. _mm_loadl_epi64(__m128i_u const *__p) {
  3390.   struct __mm_loadl_epi64_struct {
  3391.     long long __u;
  3392.   } __attribute__((__packed__, __may_alias__));
  3393.   return __extension__(__m128i){
  3394.       ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
  3395. }
  3396.  
  3397. /// Generates a 128-bit vector of [4 x i32] with unspecified content.
  3398. ///    This could be used as an argument to another intrinsic function where the
  3399. ///    argument is required but the value is not actually used.
  3400. ///
  3401. /// \headerfile <x86intrin.h>
  3402. ///
  3403. /// This intrinsic has no corresponding instruction.
  3404. ///
  3405. /// \returns A 128-bit vector of [4 x i32] with unspecified content.
  3406. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
  3407.   return (__m128i)__builtin_ia32_undef128();
  3408. }
  3409.  
  3410. /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
  3411. ///    the specified 64-bit integer values.
  3412. ///
  3413. /// \headerfile <x86intrin.h>
  3414. ///
  3415. /// This intrinsic is a utility function and does not correspond to a specific
  3416. ///    instruction.
  3417. ///
  3418. /// \param __q1
  3419. ///    A 64-bit integer value used to initialize the upper 64 bits of the
  3420. ///    destination vector of [2 x i64].
  3421. /// \param __q0
  3422. ///    A 64-bit integer value used to initialize the lower 64 bits of the
  3423. ///    destination vector of [2 x i64].
  3424. /// \returns An initialized 128-bit vector of [2 x i64] containing the values
  3425. ///    provided in the operands.
  3426. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
  3427.                                                             long long __q0) {
  3428.   return __extension__(__m128i)(__v2di){__q0, __q1};
  3429. }
  3430.  
  3431. /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
  3432. ///    the specified 64-bit integer values.
  3433. ///
  3434. /// \headerfile <x86intrin.h>
  3435. ///
  3436. /// This intrinsic is a utility function and does not correspond to a specific
  3437. ///    instruction.
  3438. ///
  3439. /// \param __q1
  3440. ///    A 64-bit integer value used to initialize the upper 64 bits of the
  3441. ///    destination vector of [2 x i64].
  3442. /// \param __q0
  3443. ///    A 64-bit integer value used to initialize the lower 64 bits of the
  3444. ///    destination vector of [2 x i64].
  3445. /// \returns An initialized 128-bit vector of [2 x i64] containing the values
  3446. ///    provided in the operands.
  3447. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
  3448.                                                            __m64 __q0) {
  3449.   return _mm_set_epi64x((long long)__q1, (long long)__q0);
  3450. }
  3451.  
  3452. /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
  3453. ///    the specified 32-bit integer values.
  3454. ///
  3455. /// \headerfile <x86intrin.h>
  3456. ///
  3457. /// This intrinsic is a utility function and does not correspond to a specific
  3458. ///    instruction.
  3459. ///
  3460. /// \param __i3
  3461. ///    A 32-bit integer value used to initialize bits [127:96] of the
  3462. ///    destination vector.
  3463. /// \param __i2
  3464. ///    A 32-bit integer value used to initialize bits [95:64] of the destination
  3465. ///    vector.
  3466. /// \param __i1
  3467. ///    A 32-bit integer value used to initialize bits [63:32] of the destination
  3468. ///    vector.
  3469. /// \param __i0
  3470. ///    A 32-bit integer value used to initialize bits [31:0] of the destination
  3471. ///    vector.
  3472. /// \returns An initialized 128-bit vector of [4 x i32] containing the values
  3473. ///    provided in the operands.
  3474. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
  3475.                                                            int __i1, int __i0) {
  3476.   return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
  3477. }
  3478.  
  3479. /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
  3480. ///    the specified 16-bit integer values.
  3481. ///
  3482. /// \headerfile <x86intrin.h>
  3483. ///
  3484. /// This intrinsic is a utility function and does not correspond to a specific
  3485. ///    instruction.
  3486. ///
  3487. /// \param __w7
  3488. ///    A 16-bit integer value used to initialize bits [127:112] of the
  3489. ///    destination vector.
  3490. /// \param __w6
  3491. ///    A 16-bit integer value used to initialize bits [111:96] of the
  3492. ///    destination vector.
  3493. /// \param __w5
  3494. ///    A 16-bit integer value used to initialize bits [95:80] of the destination
  3495. ///    vector.
  3496. /// \param __w4
  3497. ///    A 16-bit integer value used to initialize bits [79:64] of the destination
  3498. ///    vector.
  3499. /// \param __w3
  3500. ///    A 16-bit integer value used to initialize bits [63:48] of the destination
  3501. ///    vector.
  3502. /// \param __w2
  3503. ///    A 16-bit integer value used to initialize bits [47:32] of the destination
  3504. ///    vector.
  3505. /// \param __w1
  3506. ///    A 16-bit integer value used to initialize bits [31:16] of the destination
  3507. ///    vector.
  3508. /// \param __w0
  3509. ///    A 16-bit integer value used to initialize bits [15:0] of the destination
  3510. ///    vector.
  3511. /// \returns An initialized 128-bit vector of [8 x i16] containing the values
  3512. ///    provided in the operands.
  3513. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3514. _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
  3515.               short __w2, short __w1, short __w0) {
  3516.   return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
  3517.                                         __w4, __w5, __w6, __w7};
  3518. }
  3519.  
  3520. /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
  3521. ///    the specified 8-bit integer values.
  3522. ///
  3523. /// \headerfile <x86intrin.h>
  3524. ///
  3525. /// This intrinsic is a utility function and does not correspond to a specific
  3526. ///    instruction.
  3527. ///
  3528. /// \param __b15
  3529. ///    Initializes bits [127:120] of the destination vector.
  3530. /// \param __b14
  3531. ///    Initializes bits [119:112] of the destination vector.
  3532. /// \param __b13
  3533. ///    Initializes bits [111:104] of the destination vector.
  3534. /// \param __b12
  3535. ///    Initializes bits [103:96] of the destination vector.
  3536. /// \param __b11
  3537. ///    Initializes bits [95:88] of the destination vector.
  3538. /// \param __b10
  3539. ///    Initializes bits [87:80] of the destination vector.
  3540. /// \param __b9
  3541. ///    Initializes bits [79:72] of the destination vector.
  3542. /// \param __b8
  3543. ///    Initializes bits [71:64] of the destination vector.
  3544. /// \param __b7
  3545. ///    Initializes bits [63:56] of the destination vector.
  3546. /// \param __b6
  3547. ///    Initializes bits [55:48] of the destination vector.
  3548. /// \param __b5
  3549. ///    Initializes bits [47:40] of the destination vector.
  3550. /// \param __b4
  3551. ///    Initializes bits [39:32] of the destination vector.
  3552. /// \param __b3
  3553. ///    Initializes bits [31:24] of the destination vector.
  3554. /// \param __b2
  3555. ///    Initializes bits [23:16] of the destination vector.
  3556. /// \param __b1
  3557. ///    Initializes bits [15:8] of the destination vector.
  3558. /// \param __b0
  3559. ///    Initializes bits [7:0] of the destination vector.
  3560. /// \returns An initialized 128-bit vector of [16 x i8] containing the values
  3561. ///    provided in the operands.
  3562. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3563. _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
  3564.              char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
  3565.              char __b4, char __b3, char __b2, char __b1, char __b0) {
  3566.   return __extension__(__m128i)(__v16qi){
  3567.       __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
  3568.       __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
  3569. }
  3570.  
  3571. /// Initializes both values in a 128-bit integer vector with the
  3572. ///    specified 64-bit integer value.
  3573. ///
  3574. /// \headerfile <x86intrin.h>
  3575. ///
  3576. /// This intrinsic is a utility function and does not correspond to a specific
  3577. ///    instruction.
  3578. ///
  3579. /// \param __q
  3580. ///    Integer value used to initialize the elements of the destination integer
  3581. ///    vector.
  3582. /// \returns An initialized 128-bit integer vector of [2 x i64] with both
  3583. ///    elements containing the value provided in the operand.
  3584. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
  3585.   return _mm_set_epi64x(__q, __q);
  3586. }
  3587.  
  3588. /// Initializes both values in a 128-bit vector of [2 x i64] with the
  3589. ///    specified 64-bit value.
  3590. ///
  3591. /// \headerfile <x86intrin.h>
  3592. ///
  3593. /// This intrinsic is a utility function and does not correspond to a specific
  3594. ///    instruction.
  3595. ///
  3596. /// \param __q
  3597. ///    A 64-bit value used to initialize the elements of the destination integer
  3598. ///    vector.
  3599. /// \returns An initialized 128-bit vector of [2 x i64] with all elements
  3600. ///    containing the value provided in the operand.
  3601. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
  3602.   return _mm_set_epi64(__q, __q);
  3603. }
  3604.  
  3605. /// Initializes all values in a 128-bit vector of [4 x i32] with the
  3606. ///    specified 32-bit value.
  3607. ///
  3608. /// \headerfile <x86intrin.h>
  3609. ///
  3610. /// This intrinsic is a utility function and does not correspond to a specific
  3611. ///    instruction.
  3612. ///
  3613. /// \param __i
  3614. ///    A 32-bit value used to initialize the elements of the destination integer
  3615. ///    vector.
  3616. /// \returns An initialized 128-bit vector of [4 x i32] with all elements
  3617. ///    containing the value provided in the operand.
  3618. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
  3619.   return _mm_set_epi32(__i, __i, __i, __i);
  3620. }
  3621.  
  3622. /// Initializes all values in a 128-bit vector of [8 x i16] with the
  3623. ///    specified 16-bit value.
  3624. ///
  3625. /// \headerfile <x86intrin.h>
  3626. ///
  3627. /// This intrinsic is a utility function and does not correspond to a specific
  3628. ///    instruction.
  3629. ///
  3630. /// \param __w
  3631. ///    A 16-bit value used to initialize the elements of the destination integer
  3632. ///    vector.
  3633. /// \returns An initialized 128-bit vector of [8 x i16] with all elements
  3634. ///    containing the value provided in the operand.
  3635. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
  3636.   return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
  3637. }
  3638.  
  3639. /// Initializes all values in a 128-bit vector of [16 x i8] with the
  3640. ///    specified 8-bit value.
  3641. ///
  3642. /// \headerfile <x86intrin.h>
  3643. ///
  3644. /// This intrinsic is a utility function and does not correspond to a specific
  3645. ///    instruction.
  3646. ///
  3647. /// \param __b
  3648. ///    An 8-bit value used to initialize the elements of the destination integer
  3649. ///    vector.
  3650. /// \returns An initialized 128-bit vector of [16 x i8] with all elements
  3651. ///    containing the value provided in the operand.
  3652. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
  3653.   return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
  3654.                       __b, __b, __b, __b, __b);
  3655. }
  3656.  
  3657. /// Constructs a 128-bit integer vector, initialized in reverse order
  3658. ///     with the specified 64-bit integral values.
  3659. ///
  3660. /// \headerfile <x86intrin.h>
  3661. ///
  3662. /// This intrinsic does not correspond to a specific instruction.
  3663. ///
  3664. /// \param __q0
  3665. ///    A 64-bit integral value used to initialize the lower 64 bits of the
  3666. ///    result.
  3667. /// \param __q1
  3668. ///    A 64-bit integral value used to initialize the upper 64 bits of the
  3669. ///    result.
  3670. /// \returns An initialized 128-bit integer vector.
  3671. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
  3672.                                                             __m64 __q1) {
  3673.   return _mm_set_epi64(__q1, __q0);
  3674. }
  3675.  
  3676. /// Constructs a 128-bit integer vector, initialized in reverse order
  3677. ///     with the specified 32-bit integral values.
  3678. ///
  3679. /// \headerfile <x86intrin.h>
  3680. ///
  3681. /// This intrinsic is a utility function and does not correspond to a specific
  3682. ///    instruction.
  3683. ///
  3684. /// \param __i0
  3685. ///    A 32-bit integral value used to initialize bits [31:0] of the result.
  3686. /// \param __i1
  3687. ///    A 32-bit integral value used to initialize bits [63:32] of the result.
  3688. /// \param __i2
  3689. ///    A 32-bit integral value used to initialize bits [95:64] of the result.
  3690. /// \param __i3
  3691. ///    A 32-bit integral value used to initialize bits [127:96] of the result.
  3692. /// \returns An initialized 128-bit integer vector.
  3693. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
  3694.                                                             int __i2,
  3695.                                                             int __i3) {
  3696.   return _mm_set_epi32(__i3, __i2, __i1, __i0);
  3697. }
  3698.  
  3699. /// Constructs a 128-bit integer vector, initialized in reverse order
  3700. ///     with the specified 16-bit integral values.
  3701. ///
  3702. /// \headerfile <x86intrin.h>
  3703. ///
  3704. /// This intrinsic is a utility function and does not correspond to a specific
  3705. ///    instruction.
  3706. ///
  3707. /// \param __w0
  3708. ///    A 16-bit integral value used to initialize bits [15:0] of the result.
  3709. /// \param __w1
  3710. ///    A 16-bit integral value used to initialize bits [31:16] of the result.
  3711. /// \param __w2
  3712. ///    A 16-bit integral value used to initialize bits [47:32] of the result.
  3713. /// \param __w3
  3714. ///    A 16-bit integral value used to initialize bits [63:48] of the result.
  3715. /// \param __w4
  3716. ///    A 16-bit integral value used to initialize bits [79:64] of the result.
  3717. /// \param __w5
  3718. ///    A 16-bit integral value used to initialize bits [95:80] of the result.
  3719. /// \param __w6
  3720. ///    A 16-bit integral value used to initialize bits [111:96] of the result.
  3721. /// \param __w7
  3722. ///    A 16-bit integral value used to initialize bits [127:112] of the result.
  3723. /// \returns An initialized 128-bit integer vector.
  3724. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3725. _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
  3726.                short __w5, short __w6, short __w7) {
  3727.   return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
  3728. }
  3729.  
  3730. /// Constructs a 128-bit integer vector, initialized in reverse order
  3731. ///     with the specified 8-bit integral values.
  3732. ///
  3733. /// \headerfile <x86intrin.h>
  3734. ///
  3735. /// This intrinsic is a utility function and does not correspond to a specific
  3736. ///    instruction.
  3737. ///
  3738. /// \param __b0
  3739. ///    An 8-bit integral value used to initialize bits [7:0] of the result.
  3740. /// \param __b1
  3741. ///    An 8-bit integral value used to initialize bits [15:8] of the result.
  3742. /// \param __b2
  3743. ///    An 8-bit integral value used to initialize bits [23:16] of the result.
  3744. /// \param __b3
  3745. ///    An 8-bit integral value used to initialize bits [31:24] of the result.
  3746. /// \param __b4
  3747. ///    An 8-bit integral value used to initialize bits [39:32] of the result.
  3748. /// \param __b5
  3749. ///    An 8-bit integral value used to initialize bits [47:40] of the result.
  3750. /// \param __b6
  3751. ///    An 8-bit integral value used to initialize bits [55:48] of the result.
  3752. /// \param __b7
  3753. ///    An 8-bit integral value used to initialize bits [63:56] of the result.
  3754. /// \param __b8
  3755. ///    An 8-bit integral value used to initialize bits [71:64] of the result.
  3756. /// \param __b9
  3757. ///    An 8-bit integral value used to initialize bits [79:72] of the result.
  3758. /// \param __b10
  3759. ///    An 8-bit integral value used to initialize bits [87:80] of the result.
  3760. /// \param __b11
  3761. ///    An 8-bit integral value used to initialize bits [95:88] of the result.
  3762. /// \param __b12
  3763. ///    An 8-bit integral value used to initialize bits [103:96] of the result.
  3764. /// \param __b13
  3765. ///    An 8-bit integral value used to initialize bits [111:104] of the result.
  3766. /// \param __b14
  3767. ///    An 8-bit integral value used to initialize bits [119:112] of the result.
  3768. /// \param __b15
  3769. ///    An 8-bit integral value used to initialize bits [127:120] of the result.
  3770. /// \returns An initialized 128-bit integer vector.
  3771. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3772. _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
  3773.               char __b6, char __b7, char __b8, char __b9, char __b10,
  3774.               char __b11, char __b12, char __b13, char __b14, char __b15) {
  3775.   return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
  3776.                       __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
  3777. }
  3778.  
  3779. /// Creates a 128-bit integer vector initialized to zero.
  3780. ///
  3781. /// \headerfile <x86intrin.h>
  3782. ///
  3783. /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
  3784. ///
  3785. /// \returns An initialized 128-bit integer vector with all elements set to
  3786. ///    zero.
  3787. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
  3788.   return __extension__(__m128i)(__v2di){0LL, 0LL};
  3789. }
  3790.  
  3791. /// Stores a 128-bit integer vector to a memory location aligned on a
  3792. ///    128-bit boundary.
  3793. ///
  3794. /// \headerfile <x86intrin.h>
  3795. ///
  3796. /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
  3797. ///
  3798. /// \param __p
  3799. ///    A pointer to an aligned memory location that will receive the integer
  3800. ///    values.
  3801. /// \param __b
  3802. ///    A 128-bit integer vector containing the values to be moved.
  3803. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
  3804.                                                           __m128i __b) {
  3805.   *__p = __b;
  3806. }
  3807.  
  3808. /// Stores a 128-bit integer vector to an unaligned memory location.
  3809. ///
  3810. /// \headerfile <x86intrin.h>
  3811. ///
  3812. /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
  3813. ///
  3814. /// \param __p
  3815. ///    A pointer to a memory location that will receive the integer values.
  3816. /// \param __b
  3817. ///    A 128-bit integer vector containing the values to be moved.
  3818. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
  3819.                                                            __m128i __b) {
  3820.   struct __storeu_si128 {
  3821.     __m128i_u __v;
  3822.   } __attribute__((__packed__, __may_alias__));
  3823.   ((struct __storeu_si128 *)__p)->__v = __b;
  3824. }
  3825.  
  3826. /// Stores a 64-bit integer value from the low element of a 128-bit integer
  3827. ///    vector.
  3828. ///
  3829. /// \headerfile <x86intrin.h>
  3830. ///
  3831. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  3832. ///
  3833. /// \param __p
  3834. ///    A pointer to a 64-bit memory location. The address of the memory
  3835. ///    location does not have to be aligned.
  3836. /// \param __b
  3837. ///    A 128-bit integer vector containing the value to be stored.
  3838. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
  3839.                                                           __m128i __b) {
  3840.   struct __storeu_si64 {
  3841.     long long __v;
  3842.   } __attribute__((__packed__, __may_alias__));
  3843.   ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
  3844. }
  3845.  
  3846. /// Stores a 32-bit integer value from the low element of a 128-bit integer
  3847. ///    vector.
  3848. ///
  3849. /// \headerfile <x86intrin.h>
  3850. ///
  3851. /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
  3852. ///
  3853. /// \param __p
  3854. ///    A pointer to a 32-bit memory location. The address of the memory
  3855. ///    location does not have to be aligned.
  3856. /// \param __b
  3857. ///    A 128-bit integer vector containing the value to be stored.
  3858. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
  3859.                                                           __m128i __b) {
  3860.   struct __storeu_si32 {
  3861.     int __v;
  3862.   } __attribute__((__packed__, __may_alias__));
  3863.   ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
  3864. }
  3865.  
  3866. /// Stores a 16-bit integer value from the low element of a 128-bit integer
  3867. ///    vector.
  3868. ///
  3869. /// \headerfile <x86intrin.h>
  3870. ///
  3871. /// This intrinsic does not correspond to a specific instruction.
  3872. ///
  3873. /// \param __p
  3874. ///    A pointer to a 16-bit memory location. The address of the memory
  3875. ///    location does not have to be aligned.
  3876. /// \param __b
  3877. ///    A 128-bit integer vector containing the value to be stored.
  3878. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
  3879.                                                           __m128i __b) {
  3880.   struct __storeu_si16 {
  3881.     short __v;
  3882.   } __attribute__((__packed__, __may_alias__));
  3883.   ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
  3884. }
  3885.  
  3886. /// Moves bytes selected by the mask from the first operand to the
  3887. ///    specified unaligned memory location. When a mask bit is 1, the
  3888. ///    corresponding byte is written, otherwise it is not written.
  3889. ///
  3890. ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
  3891. ///    used again soon). Exception and trap behavior for elements not selected
  3892. ///    for storage to memory are implementation dependent.
  3893. ///
  3894. /// \headerfile <x86intrin.h>
  3895. ///
  3896. /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
  3897. ///   instruction.
  3898. ///
  3899. /// \param __d
  3900. ///    A 128-bit integer vector containing the values to be moved.
  3901. /// \param __n
  3902. ///    A 128-bit integer vector containing the mask. The most significant bit of
  3903. ///    each byte represents the mask bits.
  3904. /// \param __p
  3905. ///    A pointer to an unaligned 128-bit memory location where the specified
  3906. ///    values are moved.
  3907. static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
  3908.                                                               __m128i __n,
  3909.                                                               char *__p) {
  3910.   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
  3911. }
  3912.  
  3913. /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
  3914. ///    a memory location, which does not need to be aligned.
  3915. ///
  3916. /// \headerfile <x86intrin.h>
  3917. ///
  3918. /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
  3919. ///
  3920. /// \param __p
  3921. ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
  3922. ///    of the integer vector parameter.
  3923. /// \param __a
  3924. ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
  3925. ///    value to be stored.
  3926. static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
  3927.                                                            __m128i __a) {
  3928.   struct __mm_storel_epi64_struct {
  3929.     long long __u;
  3930.   } __attribute__((__packed__, __may_alias__));
  3931.   ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
  3932. }
  3933.  
  3934. /// Stores a 128-bit floating-point vector of [2 x double] to a 128-bit
  3935. ///    aligned memory location.
  3936. ///
  3937. ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
  3938. ///    used again soon).
  3939. ///
  3940. /// \headerfile <x86intrin.h>
  3941. ///
  3942. /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
  3943. ///    NOTE(review): Intel lists MOVNTPD for this intrinsic; confirm the name.
  3944. /// \param __p
  3945. ///    A pointer to the 128-bit aligned memory location used to store the value.
  3946. /// \param __a
  3947. ///    A vector of [2 x double] containing the 64-bit values to be stored.
  3948. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
  3949.                                                         __m128d __a) {
  3950.   __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
  3951. }
  3952.  
  3953. /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
  3954. ///
  3955. ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
  3956. ///    used again soon).
  3957. ///
  3958. /// \headerfile <x86intrin.h>
  3959. ///
  3960. /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
  3961. ///    NOTE(review): Intel lists MOVNTDQ for this intrinsic; confirm the name.
  3962. /// \param __p
  3963. ///    A pointer to the 128-bit aligned memory location used to store the value.
  3964. /// \param __a
  3965. ///    A 128-bit integer vector containing the values to be stored.
  3966. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
  3967.                                                            __m128i __a) {
  3968.   __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
  3969. }
  3970.  
  3971. /// Stores a 32-bit integer value in the specified memory location.
  3972. ///
  3973. ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
  3974. ///    used again soon).
  3975. ///
  3976. /// \headerfile <x86intrin.h>
  3977. ///
  3978. /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
  3979. ///
  3980. /// \param __p
  3981. ///    A pointer to the 32-bit memory location that receives \a __a.
  3982. /// \param __a
  3983. ///    The 32-bit integer value to be stored.
  3984. static __inline__ void
  3985.     __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
  3986.     _mm_stream_si32(int *__p, int __a) {
  3987.   __builtin_ia32_movnti(__p, __a);
  3988. }
  3989.  
  3990. #ifdef __x86_64__
  3991. /// Stores a 64-bit integer value in the specified memory location.
  3992. ///
  3993. ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
  3994. ///    used again soon). Only declared when targeting x86-64.
  3995. ///
  3996. /// \headerfile <x86intrin.h>
  3997. ///
  3998. /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
  3999. ///
  4000. /// \param __p
  4001. ///    A pointer to the 64-bit memory location that receives \a __a.
  4002. /// \param __a
  4003. ///    The 64-bit integer value to be stored.
  4004. static __inline__ void
  4005.     __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
  4006.     _mm_stream_si64(long long *__p, long long __a) {
  4007.   __builtin_ia32_movnti64(__p, __a);
  4008. }
  4009. #endif /* __x86_64__ */
  4010.  
  4011. #if defined(__cplusplus)
  4012. extern "C" {
  4013. #endif /* __cplusplus */
  4014. 
  4015. /// Flushes and invalidates the cache line containing \a __p from all
  4016. ///    caches in the coherency domain.
  4017. ///
  4018. /// \headerfile <x86intrin.h>
  4019. ///
  4020. /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
  4021. ///
  4022. /// \param __p
  4023. ///    A pointer to the memory location used to identify the cache line to be
  4024. ///    flushed.
  4025. void _mm_clflush(void const *__p);
  4026. 
  4027. /// Forces strong memory ordering (serialization) between load
  4028. ///    instructions preceding this instruction and load instructions following
  4029. ///    this instruction, ensuring the system completes all previous loads before
  4030. ///    executing subsequent loads.
  4031. ///
  4032. /// \headerfile <x86intrin.h>
  4033. ///
  4034. /// This intrinsic corresponds to the <c> LFENCE </c> instruction.
  4035. ///
  4036. void _mm_lfence(void);
  4037. 
  4038. /// Forces strong memory ordering (serialization) between load and store
  4039. ///    instructions preceding this instruction and load and store instructions
  4040. ///    following this instruction, ensuring that the system completes all
  4041. ///    previous memory accesses before executing subsequent memory accesses.
  4042. ///
  4043. /// \headerfile <x86intrin.h>
  4044. ///
  4045. /// This intrinsic corresponds to the <c> MFENCE </c> instruction.
  4046. ///
  4047. void _mm_mfence(void);
  4048. 
  4049. #if defined(__cplusplus)
  4050. } // extern "C"
  4051. #endif /* __cplusplus */
  4052.  
  4053. /// Converts 16-bit signed integers from both 128-bit integer vector
  4054. ///    operands into 8-bit signed integers, and packs the results into the
  4055. ///    destination. Positive values greater than 0x7F are saturated to 0x7F.
  4056. ///    Negative values less than -128 are saturated to 0x80 (-128).
  4057. ///
  4058. /// \headerfile <x86intrin.h>
  4059. ///
  4060. /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
  4061. ///
  4062. /// \param __a
  4063. ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
  4064. ///    a signed integer and is converted to an 8-bit signed integer with
  4065. ///    saturation. Values greater than 0x7F are saturated to 0x7F. Values less
  4066. ///    than -128 are saturated to 0x80. The converted [8 x i8] values are
  4067. ///    written to the lower 64 bits of the result.
  4068. /// \param __b
  4069. ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
  4070. ///    a signed integer and is converted to an 8-bit signed integer with
  4071. ///    saturation. Values greater than 0x7F are saturated to 0x7F. Values less
  4072. ///    than -128 are saturated to 0x80. The converted [8 x i8] values are
  4073. ///    written to the higher 64 bits of the result.
  4074. /// \returns A 128-bit vector of [16 x i8] containing the converted values.
  4075. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
  4076.                                                              __m128i __b) {
  4077.   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
  4078. }
  4079.  
  4080. /// Converts 32-bit signed integers from both 128-bit integer vector
  4081. ///    operands into 16-bit signed integers, and packs the results into the
  4082. ///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
  4083. ///    Negative values less than -32768 are saturated to 0x8000 (-32768).
  4084. ///
  4085. /// \headerfile <x86intrin.h>
  4086. ///
  4087. /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
  4088. ///
  4089. /// \param __a
  4090. ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
  4091. ///    a signed integer and is converted to a 16-bit signed integer with
  4092. ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
  4093. ///    less than -32768 are saturated to 0x8000. The converted [4 x i16] values
  4094. ///    are written to the lower 64 bits of the result.
  4095. /// \param __b
  4096. ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
  4097. ///    a signed integer and is converted to a 16-bit signed integer with
  4098. ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
  4099. ///    less than -32768 are saturated to 0x8000. The converted [4 x i16] values
  4100. ///    are written to the higher 64 bits of the result.
  4101. /// \returns A 128-bit vector of [8 x i16] containing the converted values.
  4102. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
  4103.                                                              __m128i __b) {
  4104.   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
  4105. }
  4106.  
  4107. /// Converts 16-bit signed integers from both 128-bit integer vector
  4108. ///    operands into 8-bit unsigned integers, and packs the results into the
  4109. ///    destination. Values greater than 0xFF are saturated to 0xFF. Negative
  4110. ///    values are saturated to 0x00.
  4111. ///
  4112. /// \headerfile <x86intrin.h>
  4113. ///
  4114. /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
  4115. ///
  4116. /// \param __a
  4117. ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
  4118. ///    a signed integer and is converted to an 8-bit unsigned integer with
  4119. ///    saturation. Values greater than 0xFF are saturated to 0xFF. Negative
  4120. ///    values are saturated to 0x00. The converted [8 x i8] values are
  4121. ///    written to the lower 64 bits of the result.
  4122. /// \param __b
  4123. ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
  4124. ///    a signed integer and is converted to an 8-bit unsigned integer with
  4125. ///    saturation. Values greater than 0xFF are saturated to 0xFF. Negative
  4126. ///    values are saturated to 0x00. The converted [8 x i8] values are
  4127. ///    written to the higher 64 bits of the result.
  4128. /// \returns A 128-bit vector of [16 x i8] containing the converted values.
  4129. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
  4130.                                                               __m128i __b) {
  4131.   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
  4132. }
  4133.  
  4134. /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
  4135. ///    the immediate-value parameter as a selector.
  4136. ///
  4137. /// \headerfile <x86intrin.h>
  4138. ///
  4139. /// \code
  4140. /// int _mm_extract_epi16(__m128i a, const int imm);
  4141. /// \endcode
  4142. ///
  4143. /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
  4144. ///
  4145. /// \param a
  4146. ///    A 128-bit integer vector.
  4147. /// \param imm
  4148. ///    An immediate value. Bits [2:0] select the value from \a a to be assigned
  4149. ///    to bits [15:0] of the result. \n
  4150. ///    000: assign values from bits [15:0] of \a a. \n
  4151. ///    001: assign values from bits [31:16] of \a a. \n
  4152. ///    010: assign values from bits [47:32] of \a a. \n
  4153. ///    011: assign values from bits [63:48] of \a a. \n
  4154. ///    100: assign values from bits [79:64] of \a a. \n
  4155. ///    101: assign values from bits [95:80] of \a a. \n
  4156. ///    110: assign values from bits [111:96] of \a a. \n
  4157. ///    111: assign values from bits [127:112] of \a a.
  4158. /// \returns An integer, whose lower 16 bits are selected from the 128-bit
  4159. ///    integer vector parameter and the remaining bits are assigned zeros.
  4160. #define _mm_extract_epi16(a, imm)                                              \
  4161.   ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
  4162.                                                     (int)(imm)))
  4163.  
  4164. /// Constructs a 128-bit integer vector by first making a copy of the
  4165. ///    128-bit integer vector parameter, and then inserting the lower 16 bits
  4166. ///    of an integer parameter into the element selected by the immediate-value
  4167. ///    parameter.
  4168. ///
  4169. /// \headerfile <x86intrin.h>
  4170. ///
  4171. /// \code
  4172. /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
  4173. /// \endcode
  4174. ///
  4175. /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
  4176. ///
  4177. /// \param a
  4178. ///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
  4179. ///    result and then one of the eight elements in the result is replaced by
  4180. ///    the lower 16 bits of \a b.
  4181. /// \param b
  4182. ///    An integer. The lower 16 bits of this parameter are written to the
  4183. ///    element of the result selected by \a imm.
  4184. /// \param imm
  4185. ///    An immediate value selecting which of the eight 16-bit elements of the
  4186. ///    result is replaced by the lower 16 bits of \a b.
  4187. /// \returns A 128-bit integer vector containing the constructed values.
  4188. #define _mm_insert_epi16(a, b, imm)                                            \
  4189.   ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
  4190.                                         (int)(imm)))
  4191.  
  4192. /// Copies the values of the most significant bits from each 8-bit
  4193. ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
  4194. ///    value, zero-extends the value, and writes it to the destination.
  4195. ///
  4196. /// \headerfile <x86intrin.h>
  4197. ///
  4198. /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
  4199. ///
  4200. /// \param __a
  4201. ///    A 128-bit integer vector of [16 x i8] containing the bits to be extracted.
  4202. /// \returns The most significant bits from each 8-bit element in \a __a,
  4203. ///    written to bits [15:0]. The other bits are assigned zeros.
  4204. static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
  4205.   return __builtin_ia32_pmovmskb128((__v16qi)__a);
  4206. }
  4207.  
  4208. /// Constructs a 128-bit integer vector by shuffling four 32-bit
  4209. ///    elements of a 128-bit integer vector parameter, using the immediate-value
  4210. ///    parameter as a specifier.
  4211. ///
  4212. /// \headerfile <x86intrin.h>
  4213. ///
  4214. /// \code
  4215. /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
  4216. /// \endcode
  4217. ///
  4218. /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
  4219. ///
  4220. /// \param a
  4221. ///    A 128-bit integer vector containing the values to be copied.
  4222. /// \param imm
  4223. ///    An 8-bit immediate value specifying which elements to copy from \a a.
  4224. ///    The 32-bit fields of the 128-bit result are assigned values as
  4225. ///    follows: \n
  4226. ///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
  4227. ///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
  4228. ///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
  4229. ///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
  4230. ///    Bit value assignments: \n
  4231. ///    00: assign values from bits [31:0] of \a a. \n
  4232. ///    01: assign values from bits [63:32] of \a a. \n
  4233. ///    10: assign values from bits [95:64] of \a a. \n
  4234. ///    11: assign values from bits [127:96] of \a a. \n
  4235. ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
  4236. ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
  4237. ///    <c>[b6, b4, b2, b0]</c>.
  4238. /// \returns A 128-bit integer vector containing the shuffled values.
  4239. #define _mm_shuffle_epi32(a, imm)                                              \
  4240.   ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
  4241.  
  4242. /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
  4243. ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
  4244. ///    value parameter as a specifier.
  4245. ///
  4246. /// \headerfile <x86intrin.h>
  4247. ///
  4248. /// \code
  4249. /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
  4250. /// \endcode
  4251. ///
  4252. /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
  4253. ///
  4254. /// \param a
  4255. ///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
  4256. ///    [127:64] of the result.
  4257. /// \param imm
  4258. ///    An 8-bit immediate value specifying which elements to copy from \a a. \n
  4259. ///    Bits [1:0] are used to assign values to bits [15:0] of the result. \n
  4260. ///    Bits [3:2] are used to assign values to bits [31:16] of the result. \n
  4261. ///    Bits [5:4] are used to assign values to bits [47:32] of the result. \n
  4262. ///    Bits [7:6] are used to assign values to bits [63:48] of the result. \n
  4263. ///    Bit value assignments: \n
  4264. ///    00: assign values from bits [15:0] of \a a. \n
  4265. ///    01: assign values from bits [31:16] of \a a. \n
  4266. ///    10: assign values from bits [47:32] of \a a. \n
  4267. ///    11: assign values from bits [63:48] of \a a. \n
  4268. ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
  4269. ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
  4270. ///    <c>[b6, b4, b2, b0]</c>.
  4271. /// \returns A 128-bit integer vector containing the shuffled values.
  4272. #define _mm_shufflelo_epi16(a, imm)                                            \
  4273.   ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
  4274.  
  4275. /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
  4276. ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
  4277. ///    value parameter as a specifier.
  4278. ///
  4279. /// \headerfile <x86intrin.h>
  4280. ///
  4281. /// \code
  4282. /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
  4283. /// \endcode
  4284. ///
  4285. /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
  4286. ///
  4287. /// \param a
  4288. ///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
  4289. ///    [63:0] of the result.
  4290. /// \param imm
  4291. ///    An 8-bit immediate value specifying which elements to copy from \a a. \n
  4292. ///    Bits [1:0] are used to assign values to bits [79:64] of the result. \n
  4293. ///    Bits [3:2] are used to assign values to bits [95:80] of the result. \n
  4294. ///    Bits [5:4] are used to assign values to bits [111:96] of the result. \n
  4295. ///    Bits [7:6] are used to assign values to bits [127:112] of the result. \n
  4296. ///    Bit value assignments: \n
  4297. ///    00: assign values from bits [79:64] of \a a. \n
  4298. ///    01: assign values from bits [95:80] of \a a. \n
  4299. ///    10: assign values from bits [111:96] of \a a. \n
  4300. ///    11: assign values from bits [127:112] of \a a. \n
  4301. ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
  4302. ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
  4303. ///    <c>[b6, b4, b2, b0]</c>.
  4304. /// \returns A 128-bit integer vector containing the shuffled values.
  4305. #define _mm_shufflehi_epi16(a, imm)                                            \
  4306.   ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
  4307.  
  4308. /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
  4309. ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
  4310. ///
  4311. /// \headerfile <x86intrin.h>
  4312. ///
  4313. /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
  4314. ///   instruction.
  4315. ///
  4316. /// \param __a
  4317. ///    A 128-bit vector of [16 x i8]. \n
  4318. ///    Bits [71:64] are written to bits [7:0] of the result. \n
  4319. ///    Bits [79:72] are written to bits [23:16] of the result. \n
  4320. ///    Bits [87:80] are written to bits [39:32] of the result. \n
  4321. ///    Bits [95:88] are written to bits [55:48] of the result. \n
  4322. ///    Bits [103:96] are written to bits [71:64] of the result. \n
  4323. ///    Bits [111:104] are written to bits [87:80] of the result. \n
  4324. ///    Bits [119:112] are written to bits [103:96] of the result. \n
  4325. ///    Bits [127:120] are written to bits [119:112] of the result.
  4326. /// \param __b
  4327. ///    A 128-bit vector of [16 x i8]. \n
  4328. ///    Bits [71:64] are written to bits [15:8] of the result. \n
  4329. ///    Bits [79:72] are written to bits [31:24] of the result. \n
  4330. ///    Bits [87:80] are written to bits [47:40] of the result. \n
  4331. ///    Bits [95:88] are written to bits [63:56] of the result. \n
  4332. ///    Bits [103:96] are written to bits [79:72] of the result. \n
  4333. ///    Bits [111:104] are written to bits [95:88] of the result. \n
  4334. ///    Bits [119:112] are written to bits [111:104] of the result. \n
  4335. ///    Bits [127:120] are written to bits [127:120] of the result.
  4336. /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
  4337. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
  4338.                                                                __m128i __b) {
  4339.   return (__m128i)__builtin_shufflevector(
  4340.       (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
  4341.       16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
  4342. }
  4343.  
  4344. /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
  4345. ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
  4346. ///
  4347. /// \headerfile <x86intrin.h>
  4348. ///
  4349. /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
  4350. ///   instruction.
  4351. ///
  4352. /// \param __a
  4353. ///    A 128-bit vector of [8 x i16]. \n
  4354. ///    Bits [79:64] are written to bits [15:0] of the result. \n
  4355. ///    Bits [95:80] are written to bits [47:32] of the result. \n
  4356. ///    Bits [111:96] are written to bits [79:64] of the result. \n
  4357. ///    Bits [127:112] are written to bits [111:96] of the result.
  4358. /// \param __b
  4359. ///    A 128-bit vector of [8 x i16]. \n
  4360. ///    Bits [79:64] are written to bits [31:16] of the result. \n
  4361. ///    Bits [95:80] are written to bits [63:48] of the result. \n
  4362. ///    Bits [111:96] are written to bits [95:80] of the result. \n
  4363. ///    Bits [127:112] are written to bits [127:112] of the result.
  4364. /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
  4365. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
  4366.                                                                 __m128i __b) {
  4367.   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
  4368.                                           8 + 5, 6, 8 + 6, 7, 8 + 7);
  4369. }
  4370.  
  4371. /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
  4372. ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
  4373. ///
  4374. /// \headerfile <x86intrin.h>
  4375. ///
  4376. /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
  4377. ///   instruction.
  4378. ///
  4379. /// \param __a
  4380. ///    A 128-bit vector of [4 x i32]. \n
  4381. ///    Bits [95:64] are written to bits [31:0] of the destination. \n
  4382. ///    Bits [127:96] are written to bits [95:64] of the destination.
  4383. /// \param __b
  4384. ///    A 128-bit vector of [4 x i32]. \n
  4385. ///    Bits [95:64] are written to bits [63:32] of the destination. \n
  4386. ///    Bits [127:96] are written to bits [127:96] of the destination.
  4387. /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
  4388. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
  4389.                                                                 __m128i __b) {
  4390.   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
  4391.                                           4 + 3);
  4392. }
  4393.  
  4394. /// Unpacks the high-order (index 1) 64-bit elements from two 128-bit vectors
  4395. ///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
  4396. ///
  4397. /// \headerfile <x86intrin.h>
  4398. ///
  4399. /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
  4400. ///   instruction.
  4401. ///
  4402. /// \param __a
  4403. ///    A 128-bit vector of [2 x i64]. \n
  4404. ///    Bits [127:64] are written to bits [63:0] of the destination.
  4405. /// \param __b
  4406. ///    A 128-bit vector of [2 x i64]. \n
  4407. ///    Bits [127:64] are written to bits [127:64] of the destination.
  4408. /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
  4409. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
  4410.                                                                 __m128i __b) {
  4411.   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
  4412. }
  4413.  
  4414. /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
  4415. ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
  4416. ///
  4417. /// \headerfile <x86intrin.h>
  4418. ///
  4419. /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
  4420. ///   instruction.
  4421. ///
  4422. /// \param __a
  4423. ///    A 128-bit vector of [16 x i8]. \n
  4424. ///    Bits [7:0] are written to bits [7:0] of the result. \n
  4425. ///    Bits [15:8] are written to bits [23:16] of the result. \n
  4426. ///    Bits [23:16] are written to bits [39:32] of the result. \n
  4427. ///    Bits [31:24] are written to bits [55:48] of the result. \n
  4428. ///    Bits [39:32] are written to bits [71:64] of the result. \n
  4429. ///    Bits [47:40] are written to bits [87:80] of the result. \n
  4430. ///    Bits [55:48] are written to bits [103:96] of the result. \n
  4431. ///    Bits [63:56] are written to bits [119:112] of the result.
  4432. /// \param __b
  4433. ///    A 128-bit vector of [16 x i8]. \n
  4434. ///    Bits [7:0] are written to bits [15:8] of the result. \n
  4435. ///    Bits [15:8] are written to bits [31:24] of the result. \n
  4436. ///    Bits [23:16] are written to bits [47:40] of the result. \n
  4437. ///    Bits [31:24] are written to bits [63:56] of the result. \n
  4438. ///    Bits [39:32] are written to bits [79:72] of the result. \n
  4439. ///    Bits [47:40] are written to bits [95:88] of the result. \n
  4440. ///    Bits [55:48] are written to bits [111:104] of the result. \n
  4441. ///    Bits [63:56] are written to bits [127:120] of the result.
  4442. /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
  4443. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
  4444.                                                                __m128i __b) {
  4445.   return (__m128i)__builtin_shufflevector(
  4446.       (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
  4447.       16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
  4448. }
  4449.  
  4450. /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
  4451. ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
  4452. ///    [8 x i16].
  4453. ///
  4454. /// \headerfile <x86intrin.h>
  4455. ///
  4456. /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
  4457. ///   instruction.
  4458. ///
  4459. /// \param __a
  4460. ///    A 128-bit vector of [8 x i16]. \n
  4461. ///    Bits [15:0] are written to bits [15:0] of the result. \n
  4462. ///    Bits [31:16] are written to bits [47:32] of the result. \n
  4463. ///    Bits [47:32] are written to bits [79:64] of the result. \n
  4464. ///    Bits [63:48] are written to bits [111:96] of the result.
  4465. /// \param __b
  4466. ///    A 128-bit vector of [8 x i16]. \n
  4467. ///    Bits [15:0] are written to bits [31:16] of the result. \n
  4468. ///    Bits [31:16] are written to bits [63:48] of the result. \n
  4469. ///    Bits [47:32] are written to bits [95:80] of the result. \n
  4470. ///    Bits [63:48] are written to bits [127:112] of the result.
  4471. /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
  4472. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
  4473.                                                                 __m128i __b) {
  4474.   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
  4475.                                           8 + 1, 2, 8 + 2, 3, 8 + 3);
  4476. }
  4477.  
  4478. /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
  4479. ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
  4480. ///
  4481. /// \headerfile <x86intrin.h>
  4482. ///
  4483. /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
  4484. ///   instruction.
  4485. ///
  4486. /// \param __a
  4487. ///    A 128-bit vector of [4 x i32]. \n
  4488. ///    Bits [31:0] are written to bits [31:0] of the destination. \n
  4489. ///    Bits [63:32] are written to bits [95:64] of the destination.
  4490. /// \param __b
  4491. ///    A 128-bit vector of [4 x i32]. \n
  4492. ///    Bits [31:0] are written to bits [63:32] of the destination. \n
  4493. ///    Bits [63:32] are written to bits [127:96] of the destination.
  4494. /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
  4495. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
  4496.                                                                 __m128i __b) {
  4497.   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
  4498.                                           4 + 1);
  4499. }
  4500.  
  4501. /// Unpacks the low-order (index 0) 64-bit elements from two 128-bit vectors
  4502. ///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
  4503. ///
  4504. /// \headerfile <x86intrin.h>
  4505. ///
  4506. /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
  4507. ///   instruction.
  4508. ///
  4509. /// \param __a
  4510. ///    A 128-bit vector of [2 x i64]. \n
  4511. ///    Bits [63:0] are written to bits [63:0] of the destination.
  4512. /// \param __b
  4513. ///    A 128-bit vector of [2 x i64]. \n
  4514. ///    Bits [63:0] are written to bits [127:64] of the destination.
  4515. /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
  4516. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
  4517.                                                                 __m128i __b) {
  4518.   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
  4519. }
  4520.  
  4521. /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
  4522. ///    \c __m64 value.
  4523. ///
  4524. /// \headerfile <x86intrin.h>
  4525. ///
  4526. /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
  4527. ///
  4528. /// \param __a
  4529. ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
  4530. ///    destination.
  4531. /// \returns An \c __m64 value containing the lower 64 bits of the parameter.
  4532. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
  4533.   return (__m64)__a[0];
  4534. }
  4535.  
  4536. /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
  4537. ///    upper bits.
  4538. ///
  4539. /// \headerfile <x86intrin.h>
  4540. ///
  4541. /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
  4542. ///
  4543. /// \param __a
  4544. ///    A 64-bit value of type \c __m64.
  4545. /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
  4546. ///    the operand. The upper 64 bits are assigned zeros.
  4547. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
  4548.   return __extension__(__m128i)(__v2di){(long long)__a, 0};
  4549. }
  4550.  
  4551. /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
  4552. ///    integer vector, zeroing the upper bits.
  4553. ///
  4554. /// \headerfile <x86intrin.h>
  4555. ///
  4556. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  4557. ///
  4558. /// \param __a
  4559. ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
  4560. ///    destination.
  4561. /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
  4562. ///    the operand. The upper 64 bits are assigned zeros.
  4563. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
  4564.   return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
  4565. }
  4566.  
  4567. /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
  4568. ///    [2 x double] and interleaves them into a 128-bit vector of [2 x
  4569. ///    double].
  4570. ///
  4571. /// \headerfile <x86intrin.h>
  4572. ///
  4573. /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
  4574. ///
  4575. /// \param __a
  4576. ///    A 128-bit vector of [2 x double]. \n
  4577. ///    Bits [127:64] are written to bits [63:0] of the destination.
  4578. /// \param __b
  4579. ///    A 128-bit vector of [2 x double]. \n
  4580. ///    Bits [127:64] are written to bits [127:64] of the destination.
  4581. /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
  4582. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
  4583.                                                              __m128d __b) {
  4584.   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
  4585. }
  4586.  
  4587. /// Unpacks the low-order 64-bit elements from two 128-bit vectors
  4588. ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
  4589. ///    double].
  4590. ///
  4591. /// \headerfile <x86intrin.h>
  4592. ///
  4593. /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
  4594. ///
  4595. /// \param __a
  4596. ///    A 128-bit vector of [2 x double]. \n
  4597. ///    Bits [63:0] are written to bits [63:0] of the destination.
  4598. /// \param __b
  4599. ///    A 128-bit vector of [2 x double]. \n
  4600. ///    Bits [63:0] are written to bits [127:64] of the destination.
  4601. /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
  4602. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
  4603.                                                              __m128d __b) {
  4604.   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
  4605. }
  4606.  
  4607. /// Extracts the sign bits of the double-precision values in the 128-bit
  4608. ///    vector of [2 x double], zero-extends the value, and writes it to the
  4609. ///    low-order bits of the destination.
  4610. ///
  4611. /// \headerfile <x86intrin.h>
  4612. ///
  4613. /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
  4614. ///
  4615. /// \param __a
  4616. ///    A 128-bit vector of [2 x double] containing the values with sign bits to
  4617. ///    be extracted.
  4618. /// \returns The sign bits from each of the double-precision elements in \a __a,
  4619. ///    written to bits [1:0]. The remaining bits are assigned values of zero.
  4620. static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
  4621.   return __builtin_ia32_movmskpd((__v2df)__a);
  4622. }
  4623.  
  4624. /// Constructs a 128-bit floating-point vector of [2 x double] from two
  4625. ///    128-bit vector parameters of [2 x double], using the immediate-value
  4626. ///     parameter as a specifier.
  4627. ///
  4628. /// \headerfile <x86intrin.h>
  4629. ///
  4630. /// \code
  4631. /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
  4632. /// \endcode
  4633. ///
  4634. /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
  4635. ///
  4636. /// \param a
  4637. ///    A 128-bit vector of [2 x double].
  4638. /// \param b
  4639. ///    A 128-bit vector of [2 x double].
  4640. /// \param i
  4641. ///    An 8-bit immediate value. The least significant two bits specify which
  4642. ///    elements to copy from \a a and \a b: \n
  4643. ///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
  4644. ///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
  4645. ///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
  4646. ///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
  4647. ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
  4648. ///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
  4649. ///    <c>[b1, b0]</c>.
  4650. /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
  4651. #define _mm_shuffle_pd(a, b, i)                                                \
  4652.   ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
  4653.                                   (int)(i)))
  4654.  
  4655. /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
  4656. ///    floating-point vector of [4 x float].
  4657. ///
  4658. /// \headerfile <x86intrin.h>
  4659. ///
  4660. /// This intrinsic has no corresponding instruction.
  4661. ///
  4662. /// \param __a
  4663. ///    A 128-bit floating-point vector of [2 x double].
  4664. /// \returns A 128-bit floating-point vector of [4 x float] containing the same
  4665. ///    bitwise pattern as the parameter.
  4666. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
  4667.   return (__m128)__a;
  4668. }
  4669.  
  4670. /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
  4671. ///    integer vector.
  4672. ///
  4673. /// \headerfile <x86intrin.h>
  4674. ///
  4675. /// This intrinsic has no corresponding instruction.
  4676. ///
  4677. /// \param __a
  4678. ///    A 128-bit floating-point vector of [2 x double].
  4679. /// \returns A 128-bit integer vector containing the same bitwise pattern as the
  4680. ///    parameter.
  4681. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
  4682.   return (__m128i)__a;
  4683. }
  4684.  
  4685. /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
  4686. ///    floating-point vector of [2 x double].
  4687. ///
  4688. /// \headerfile <x86intrin.h>
  4689. ///
  4690. /// This intrinsic has no corresponding instruction.
  4691. ///
  4692. /// \param __a
  4693. ///    A 128-bit floating-point vector of [4 x float].
  4694. /// \returns A 128-bit floating-point vector of [2 x double] containing the same
  4695. ///    bitwise pattern as the parameter.
  4696. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
  4697.   return (__m128d)__a;
  4698. }
  4699.  
  4700. /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
  4701. ///    integer vector.
  4702. ///
  4703. /// \headerfile <x86intrin.h>
  4704. ///
  4705. /// This intrinsic has no corresponding instruction.
  4706. ///
  4707. /// \param __a
  4708. ///    A 128-bit floating-point vector of [4 x float].
  4709. /// \returns A 128-bit integer vector containing the same bitwise pattern as the
  4710. ///    parameter.
  4711. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
  4712.   return (__m128i)__a;
  4713. }
  4714.  
  4715. /// Casts a 128-bit integer vector into a 128-bit floating-point vector
  4716. ///    of [4 x float].
  4717. ///
  4718. /// \headerfile <x86intrin.h>
  4719. ///
  4720. /// This intrinsic has no corresponding instruction.
  4721. ///
  4722. /// \param __a
  4723. ///    A 128-bit integer vector.
  4724. /// \returns A 128-bit floating-point vector of [4 x float] containing the same
  4725. ///    bitwise pattern as the parameter.
  4726. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
  4727.   return (__m128)__a;
  4728. }
  4729.  
  4730. /// Casts a 128-bit integer vector into a 128-bit floating-point vector
  4731. ///    of [2 x double].
  4732. ///
  4733. /// \headerfile <x86intrin.h>
  4734. ///
  4735. /// This intrinsic has no corresponding instruction.
  4736. ///
  4737. /// \param __a
  4738. ///    A 128-bit integer vector.
  4739. /// \returns A 128-bit floating-point vector of [2 x double] containing the same
  4740. ///    bitwise pattern as the parameter.
  4741. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
  4742.   return (__m128d)__a;
  4743. }
  4744.  
  4745. #if defined(__cplusplus)
  4746. extern "C" {
  4747. #endif
  4748.  
  4749. /// Indicates that a spin loop is being executed for the purposes of
  4750. ///    optimizing power consumption during the loop.
  4751. ///
  4752. /// \headerfile <x86intrin.h>
  4753. ///
  4754. /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
  4755. ///
  4756. void _mm_pause(void);
  4757.  
  4758. #if defined(__cplusplus)
  4759. } // extern "C"
  4760. #endif
  4761. #undef __DEFAULT_FN_ATTRS
  4762. #undef __DEFAULT_FN_ATTRS_MMX
  4763.  
  4764. #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
  4765.  
  4766. #define _MM_DENORMALS_ZERO_ON (0x0040U)
  4767. #define _MM_DENORMALS_ZERO_OFF (0x0000U)
  4768.  
  4769. #define _MM_DENORMALS_ZERO_MASK (0x0040U)
  4770.  
  4771. #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
  4772. #define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  4773.   (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
  4774.  
  4775. #endif /* __EMMINTRIN_H */
  4776.