Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __SMMINTRIN_H
  11. #define __SMMINTRIN_H
  12.  
  /* Compile-time guard: stop with an error unless one of the x86 target
     macros (__i386__ or __x86_64__) is defined. */
  13. #if !defined(__i386__) && !defined(__x86_64__)
  14. #error "This header is only meant to be used on x86 and x64 architecture"
  15. #endif
  16.  
  17. #include <tmmintrin.h>
  18.  
  19. /* Define the default attributes for the functions in this file. */
  /* Used by the static inline intrinsics below: always_inline, nodebug,
     target("sse4.1") and a minimum vector width of 128. */
  20. #define __DEFAULT_FN_ATTRS                                                     \
  21.   __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),         \
  22.                  __min_vector_width__(128)))
  23.  
  24. /* SSE4 Rounding macros. */
  /* Rounding-control definition, immediate bits [1:0]. */
  25. #define _MM_FROUND_TO_NEAREST_INT 0x00
  26. #define _MM_FROUND_TO_NEG_INF 0x01
  27. #define _MM_FROUND_TO_POS_INF 0x02
  28. #define _MM_FROUND_TO_ZERO 0x03
  /* Immediate bit [2]: take the rounding mode from the current MXCSR setting
     instead of bits [1:0]. */
  29. #define _MM_FROUND_CUR_DIRECTION 0x04
  30.  
  /* Immediate bit [3]: 0 uses the normal precision exception, 1 leaves the PE
     field un-updated. */
  31. #define _MM_FROUND_RAISE_EXC 0x00
  32. #define _MM_FROUND_NO_EXC 0x08
  33.  
  /* Ready-made immediates: one exception policy OR-ed with one rounding
     source/mode. */
  34. #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
  35. #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
  36. #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
  37. #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
  38. #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
  39. #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
  40.  
  41. /// Rounds up each element of the 128-bit vector of [4 x float] to an
  42. ///    integer and returns the rounded values in a 128-bit vector of
  43. ///    [4 x float].
  44. ///
  45. /// \headerfile <x86intrin.h>
  46. ///
  47. /// \code
  48. /// __m128 _mm_ceil_ps(__m128 X);
  49. /// \endcode
  50. ///
  51. /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
  52. ///
  53. /// \param X
  54. ///    A 128-bit vector of [4 x float] values to be rounded up.
  55. /// \returns A 128-bit vector of [4 x float] containing the rounded values.
  /* Thin alias: _mm_round_ps with the _MM_FROUND_CEIL immediate; X is
     parenthesized so an expression argument expands as one operand. */
  56. #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
  57.  
  58. /// Rounds up each element of the 128-bit vector of [2 x double] to an
  59. ///    integer and returns the rounded values in a 128-bit vector of
  60. ///    [2 x double].
  61. ///
  62. /// \headerfile <x86intrin.h>
  63. ///
  64. /// \code
  65. /// __m128d _mm_ceil_pd(__m128d X);
  66. /// \endcode
  67. ///
  68. /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
  69. ///
  70. /// \param X
  71. ///    A 128-bit vector of [2 x double] values to be rounded up.
  72. /// \returns A 128-bit vector of [2 x double] containing the rounded values.
  /* Thin alias: _mm_round_pd with the _MM_FROUND_CEIL immediate. */
  73. #define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
  74.  
  75. /// Copies three upper elements of the first 128-bit vector operand to
  76. ///    the corresponding three upper elements of the 128-bit result vector of
  77. ///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
  78. ///    operand to an integer and copies it to the lowest element of the 128-bit
  79. ///    result vector of [4 x float].
  80. ///
  81. /// \headerfile <x86intrin.h>
  82. ///
  83. /// \code
  84. /// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
  85. /// \endcode
  86. ///
  87. /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
  88. ///
  89. /// \param X
  90. ///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
  91. ///    copied to the corresponding bits of the result.
  92. /// \param Y
  93. ///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
  94. ///    rounded up to the nearest integer and copied to the corresponding bits
  95. ///    of the result.
  96. /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
  97. ///    values.
  /* Thin alias: _mm_round_ss with the _MM_FROUND_CEIL immediate; X and Y are
     forwarded in the same order. */
  98. #define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
  99.  
  100. /// Copies the upper element of the first 128-bit vector operand to the
  101. ///    corresponding upper element of the 128-bit result vector of [2 x double].
  102. ///    Rounds up the lower element of the second 128-bit vector operand to an
  103. ///    integer and copies it to the lower element of the 128-bit result vector
  104. ///    of [2 x double].
  105. ///
  106. /// \headerfile <x86intrin.h>
  107. ///
  108. /// \code
  109. /// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
  110. /// \endcode
  111. ///
  112. /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
  113. ///
  114. /// \param X
  115. ///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
  116. ///    copied to the corresponding bits of the result.
  117. /// \param Y
  118. ///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
  119. ///    rounded up to the nearest integer and copied to the corresponding bits
  120. ///    of the result.
  121. /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
  122. ///    values.
  /* Thin alias: _mm_round_sd with the _MM_FROUND_CEIL immediate; X and Y are
     forwarded in the same order. */
  123. #define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
  124.  
  125. /// Rounds down each element of the 128-bit vector of [4 x float] to an
  126. ///    integer and returns the rounded values in a 128-bit vector of
  127. ///    [4 x float].
  128. ///
  129. /// \headerfile <x86intrin.h>
  130. ///
  131. /// \code
  132. /// __m128 _mm_floor_ps(__m128 X);
  133. /// \endcode
  134. ///
  135. /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
  136. ///
  137. /// \param X
  138. ///    A 128-bit vector of [4 x float] values to be rounded down.
  139. /// \returns A 128-bit vector of [4 x float] containing the rounded values.
  /* Thin alias: _mm_round_ps with the _MM_FROUND_FLOOR immediate. */
  140. #define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
  141.  
  142. /// Rounds down each element of the 128-bit vector of [2 x double] to an
  143. ///    integer and returns the rounded values in a 128-bit vector of
  144. ///    [2 x double].
  145. ///
  146. /// \headerfile <x86intrin.h>
  147. ///
  148. /// \code
  149. /// __m128d _mm_floor_pd(__m128d X);
  150. /// \endcode
  151. ///
  152. /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
  153. ///
  154. /// \param X
  155. ///    A 128-bit vector of [2 x double] values to be rounded down.
  156. /// \returns A 128-bit vector of [2 x double] containing the rounded values.
  /* Thin alias: _mm_round_pd with the _MM_FROUND_FLOOR immediate. */
  157. #define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
  158.  
  159. /// Copies three upper elements of the first 128-bit vector operand to
  160. ///    the corresponding three upper elements of the 128-bit result vector of
  161. ///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
  162. ///    operand to an integer and copies it to the lowest element of the 128-bit
  163. ///    result vector of [4 x float].
  164. ///
  165. /// \headerfile <x86intrin.h>
  166. ///
  167. /// \code
  168. /// __m128 _mm_floor_ss(__m128 X, __m128 Y);
  169. /// \endcode
  170. ///
  171. /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
  172. ///
  173. /// \param X
  174. ///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
  175. ///    copied to the corresponding bits of the result.
  176. /// \param Y
  177. ///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
  178. ///    rounded down to the nearest integer and copied to the corresponding bits
  179. ///    of the result.
  180. /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
  181. ///    values.
  /* Thin alias: _mm_round_ss with the _MM_FROUND_FLOOR immediate; X and Y are
     forwarded in the same order. */
  182. #define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
  183.  
  184. /// Copies the upper element of the first 128-bit vector operand to the
  185. ///    corresponding upper element of the 128-bit result vector of [2 x double].
  186. ///    Rounds down the lower element of the second 128-bit vector operand to an
  187. ///    integer and copies it to the lower element of the 128-bit result vector
  188. ///    of [2 x double].
  189. ///
  190. /// \headerfile <x86intrin.h>
  191. ///
  192. /// \code
  193. /// __m128d _mm_floor_sd(__m128d X, __m128d Y);
  194. /// \endcode
  195. ///
  196. /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
  197. ///
  198. /// \param X
  199. ///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
  200. ///    copied to the corresponding bits of the result.
  201. /// \param Y
  202. ///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
  203. ///    rounded down to the nearest integer and copied to the corresponding bits
  204. ///    of the result.
  205. /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
  206. ///    values.
  /* Thin alias: _mm_round_sd with the _MM_FROUND_FLOOR immediate; X and Y are
     forwarded in the same order. */
  207. #define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
  208.  
  209. /// Rounds each element of the 128-bit vector of [4 x float] to an
  210. ///    integer value according to the rounding control specified by the second
  211. ///    argument and returns the rounded values in a 128-bit vector of
  212. ///    [4 x float].
  213. ///
  214. /// \headerfile <x86intrin.h>
  215. ///
  216. /// \code
  217. /// __m128 _mm_round_ps(__m128 X, const int M);
  218. /// \endcode
  219. ///
  220. /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
  221. ///
  222. /// \param X
  223. ///    A 128-bit vector of [4 x float].
  224. /// \param M
  225. ///    An integer value that specifies the rounding operation. \n
  226. ///    Bits [7:4] are reserved. \n
  227. ///    Bit [3] is a precision exception value: \n
  228. ///      0: A normal PE exception is used \n
  229. ///      1: The PE field is not updated \n
  230. ///    Bit [2] is the rounding control source: \n
  231. ///      0: Use bits [1:0] of \a M \n
  232. ///      1: Use the current MXCSR setting \n
  233. ///    Bits [1:0] contain the rounding control definition: \n
  234. ///      00: Nearest \n
  235. ///      01: Downward (toward negative infinity) \n
  236. ///      10: Upward (toward positive infinity) \n
  237. ///      11: Truncated
  238. /// \returns A 128-bit vector of [4 x float] containing the rounded values.
  /* X is first cast to the public __m128 type and then to the __v4sf operand
     type of the builtin; M is passed through as written. */
  239. #define _mm_round_ps(X, M)                                                     \
  240.   ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
  241.  
  242. /// Copies three upper elements of the first 128-bit vector operand to
  243. ///    the corresponding three upper elements of the 128-bit result vector of
  244. ///    [4 x float]. Rounds the lowest element of the second 128-bit vector
  245. ///    operand to an integer value according to the rounding control specified
  246. ///    by the third argument and copies it to the lowest element of the 128-bit
  247. ///    result vector of [4 x float].
  248. ///
  249. /// \headerfile <x86intrin.h>
  250. ///
  251. /// \code
  252. /// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
  253. /// \endcode
  254. ///
  255. /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
  256. ///
  257. /// \param X
  258. ///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
  259. ///    copied to the corresponding bits of the result.
  260. /// \param Y
  261. ///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
  262. ///    rounded to the nearest integer using the specified rounding control and
  263. ///    copied to the corresponding bits of the result.
  264. /// \param M
  265. ///    An integer value that specifies the rounding operation. \n
  266. ///    Bits [7:4] are reserved. \n
  267. ///    Bit [3] is a precision exception value: \n
  268. ///      0: A normal PE exception is used \n
  269. ///      1: The PE field is not updated \n
  270. ///    Bit [2] is the rounding control source: \n
  271. ///      0: Use bits [1:0] of \a M \n
  272. ///      1: Use the current MXCSR setting \n
  273. ///    Bits [1:0] contain the rounding control definition: \n
  274. ///      00: Nearest \n
  275. ///      01: Downward (toward negative infinity) \n
  276. ///      10: Upward (toward positive infinity) \n
  277. ///      11: Truncated
  278. /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
  279. ///    values.
  /* X and Y are each cast to __m128 and then to __v4sf for the builtin; M is
     passed through as written. */
  280. #define _mm_round_ss(X, Y, M)                                                  \
  281.   ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
  282.                                   (M)))
  283.  
  284. /// Rounds each element of the 128-bit vector of [2 x double] to an
  285. ///    integer value according to the rounding control specified by the second
  286. ///    argument and returns the rounded values in a 128-bit vector of
  287. ///    [2 x double].
  288. ///
  289. /// \headerfile <x86intrin.h>
  290. ///
  291. /// \code
  292. /// __m128d _mm_round_pd(__m128d X, const int M);
  293. /// \endcode
  294. ///
  295. /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
  296. ///
  297. /// \param X
  298. ///    A 128-bit vector of [2 x double].
  299. /// \param M
  300. ///    An integer value that specifies the rounding operation. \n
  301. ///    Bits [7:4] are reserved. \n
  302. ///    Bit [3] is a precision exception value: \n
  303. ///      0: A normal PE exception is used \n
  304. ///      1: The PE field is not updated \n
  305. ///    Bit [2] is the rounding control source: \n
  306. ///      0: Use bits [1:0] of \a M \n
  307. ///      1: Use the current MXCSR setting \n
  308. ///    Bits [1:0] contain the rounding control definition: \n
  309. ///      00: Nearest \n
  310. ///      01: Downward (toward negative infinity) \n
  311. ///      10: Upward (toward positive infinity) \n
  312. ///      11: Truncated
  313. /// \returns A 128-bit vector of [2 x double] containing the rounded values.
  /* X is first cast to the public __m128d type and then to the __v2df operand
     type of the builtin; M is passed through as written. */
  314. #define _mm_round_pd(X, M)                                                     \
  315.   ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
  316.  
  317. /// Copies the upper element of the first 128-bit vector operand to the
  318. ///    corresponding upper element of the 128-bit result vector of [2 x double].
  319. ///    Rounds the lower element of the second 128-bit vector operand to an
  320. ///    integer value according to the rounding control specified by the third
  321. ///    argument and copies it to the lower element of the 128-bit result vector
  322. ///    of [2 x double].
  323. ///
  324. /// \headerfile <x86intrin.h>
  325. ///
  326. /// \code
  327. /// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
  328. /// \endcode
  329. ///
  330. /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
  331. ///
  332. /// \param X
  333. ///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
  334. ///    copied to the corresponding bits of the result.
  335. /// \param Y
  336. ///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
  337. ///    rounded to the nearest integer using the specified rounding control and
  338. ///    copied to the corresponding bits of the result.
  339. /// \param M
  340. ///    An integer value that specifies the rounding operation. \n
  341. ///    Bits [7:4] are reserved. \n
  342. ///    Bit [3] is a precision exception value: \n
  343. ///      0: A normal PE exception is used \n
  344. ///      1: The PE field is not updated \n
  345. ///    Bit [2] is the rounding control source: \n
  346. ///      0: Use bits [1:0] of \a M \n
  347. ///      1: Use the current MXCSR setting \n
  348. ///    Bits [1:0] contain the rounding control definition: \n
  349. ///      00: Nearest \n
  350. ///      01: Downward (toward negative infinity) \n
  351. ///      10: Upward (toward positive infinity) \n
  352. ///      11: Truncated
  353. /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
  354. ///    values.
  /* X and Y are each cast to __m128d and then to __v2df for the builtin; M is
     passed through as written. */
  355. #define _mm_round_sd(X, Y, M)                                                  \
  356.   ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
  357.                                    (M)))
  358.  
  359. /* SSE4 Packed Blending Intrinsics.  */
  360. /// Returns a 128-bit vector of [2 x double] where the values are
  361. ///    selected from either the first or second operand as specified by the
  362. ///    third operand, the control mask.
  363. ///
  364. /// \headerfile <x86intrin.h>
  365. ///
  366. /// \code
  367. /// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
  368. /// \endcode
  369. ///
  370. /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
  371. ///
  372. /// \param V1
  373. ///    A 128-bit vector of [2 x double].
  374. /// \param V2
  375. ///    A 128-bit vector of [2 x double].
  376. /// \param M
  377. ///    An immediate integer operand, with mask bits [1:0] specifying how the
  378. ///    values are to be copied. The position of the mask bit corresponds to the
  379. ///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
  380. ///    element in operand \a V1 is copied to the same position in the result.
  381. ///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
  382. ///    is copied to the same position in the result.
  383. /// \returns A 128-bit vector of [2 x double] containing the copied values.
  /* V1 and V2 are cast to __m128d and then to __v2df for the builtin; the mask
     M is explicitly converted to int. */
  384. #define _mm_blend_pd(V1, V2, M)                                                \
  385.   ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
  386.                                    (__v2df)(__m128d)(V2), (int)(M)))
  387.  
  388. /// Returns a 128-bit vector of [4 x float] where the values are selected
  389. ///    from either the first or second operand as specified by the third
  390. ///    operand, the control mask.
  391. ///
  392. /// \headerfile <x86intrin.h>
  393. ///
  394. /// \code
  395. /// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
  396. /// \endcode
  397. ///
  398. /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
  399. ///
  400. /// \param V1
  401. ///    A 128-bit vector of [4 x float].
  402. /// \param V2
  403. ///    A 128-bit vector of [4 x float].
  404. /// \param M
  405. ///    An immediate integer operand, with mask bits [3:0] specifying how the
  406. ///    values are to be copied. The position of the mask bit corresponds to the
  407. ///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
  408. ///    element in operand \a V1 is copied to the same position in the result.
  409. ///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
  410. ///    is copied to the same position in the result.
  411. /// \returns A 128-bit vector of [4 x float] containing the copied values.
  /* V1 and V2 are cast to __m128 and then to __v4sf for the builtin; the mask
     M is explicitly converted to int. */
  412. #define _mm_blend_ps(V1, V2, M)                                                \
  413.   ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
  414.                                   (int)(M)))
  415.  
  416. /// Returns a 128-bit vector of [2 x double] where the values are
  417. ///    selected from either the first or second operand as specified by the
  418. ///    third operand, the control mask.
  419. ///
  420. /// \headerfile <x86intrin.h>
  421. ///
  422. /// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
  423. ///
  424. /// \param __V1
  425. ///    A 128-bit vector of [2 x double].
  426. /// \param __V2
  427. ///    A 128-bit vector of [2 x double].
  428. /// \param __M
  429. ///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
  430. ///    values are to be copied. The position of the mask bit corresponds to the
  431. ///    most significant bit of a copied value. When a mask bit is 0, the
  432. ///    corresponding 64-bit element in operand \a __V1 is copied to the same
  433. ///    position in the result. When a mask bit is 1, the corresponding 64-bit
  434. ///    element in operand \a __V2 is copied to the same position in the result.
  435. /// \returns A 128-bit vector of [2 x double] containing the copied values.
  436. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
  437.                                                            __m128d __V2,
  438.                                                            __m128d __M) {
         /* All three operands, including the mask, are handed to the builtin
            as __v2df; the result is cast back to __m128d. */
  439.   return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
  440.                                           (__v2df)__M);
  441. }
  442.  
  443. /// Returns a 128-bit vector of [4 x float] where the values are
  444. ///    selected from either the first or second operand as specified by the
  445. ///    third operand, the control mask.
  446. ///
  447. /// \headerfile <x86intrin.h>
  448. ///
  449. /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
  450. ///
  451. /// \param __V1
  452. ///    A 128-bit vector of [4 x float].
  453. /// \param __V2
  454. ///    A 128-bit vector of [4 x float].
  455. /// \param __M
  456. ///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
  457. ///    how the values are to be copied. The position of the mask bit corresponds
  458. ///    to the most significant bit of a copied value. When a mask bit is 0, the
  459. ///    corresponding 32-bit element in operand \a __V1 is copied to the same
  460. ///    position in the result. When a mask bit is 1, the corresponding 32-bit
  461. ///    element in operand \a __V2 is copied to the same position in the result.
  462. /// \returns A 128-bit vector of [4 x float] containing the copied values.
  463. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
  464.                                                           __m128 __V2,
  465.                                                           __m128 __M) {
         /* All three operands, including the mask, are handed to the builtin
            as __v4sf; the result is cast back to __m128. */
  466.   return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
  467.                                          (__v4sf)__M);
  468. }
  469.  
  470. /// Returns a 128-bit vector of [16 x i8] where the values are selected
  471. ///    from either of the first or second operand as specified by the third
  472. ///    operand, the control mask.
  473. ///
  474. /// \headerfile <x86intrin.h>
  475. ///
  476. /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
  477. ///
  478. /// \param __V1
  479. ///    A 128-bit vector of [16 x i8].
  480. /// \param __V2
  481. ///    A 128-bit vector of [16 x i8].
  482. /// \param __M
  483. ///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
  484. ///    how the values are to be copied. The position of the mask bit corresponds
  485. ///    to the most significant bit of a copied value. When a mask bit is 0, the
  486. ///    corresponding 8-bit element in operand \a __V1 is copied to the same
  487. ///    position in the result. When a mask bit is 1, the corresponding 8-bit
  488. ///    element in operand \a __V2 is copied to the same position in the result.
  489. /// \returns A 128-bit vector of [16 x i8] containing the copied values.
  490. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
  491.                                                              __m128i __V2,
  492.                                                              __m128i __M) {
         /* All three operands, including the mask, are handed to the builtin
            as __v16qi; the result is cast back to __m128i. */
  493.   return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
  494.                                              (__v16qi)__M);
  495. }
  496.  
  497. /// Returns a 128-bit vector of [8 x i16] where the values are selected
  498. ///    from either of the first or second operand as specified by the third
  499. ///    operand, the control mask.
  500. ///
  501. /// \headerfile <x86intrin.h>
  502. ///
  503. /// \code
  504. /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
  505. /// \endcode
  506. ///
  507. /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
  508. ///
  509. /// \param V1
  510. ///    A 128-bit vector of [8 x i16].
  511. /// \param V2
  512. ///    A 128-bit vector of [8 x i16].
  513. /// \param M
  514. ///    An immediate integer operand, with mask bits [7:0] specifying how the
  515. ///    values are to be copied. The position of the mask bit corresponds to the
  516. ///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
  517. ///    element in operand \a V1 is copied to the same position in the result.
  518. ///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
  519. ///    is copied to the same position in the result.
  520. /// \returns A 128-bit vector of [8 x i16] containing the copied values.
  /* V1 and V2 are cast to __m128i and then to __v8hi for the builtin; the mask
     M is explicitly converted to int. */
  521. #define _mm_blend_epi16(V1, V2, M)                                             \
  522.   ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
  523.                                       (__v8hi)(__m128i)(V2), (int)(M)))
  524.  
  525. /* SSE4 Dword Multiply Instructions.  */
  526. /// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
  527. ///    and returns the lower 32 bits of each product in a 128-bit vector of
  528. ///    [4 x i32].
  529. ///
  530. /// \headerfile <x86intrin.h>
  531. ///
  532. /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
  533. ///
  534. /// \param __V1
  535. ///    A 128-bit integer vector.
  536. /// \param __V2
  537. ///    A 128-bit integer vector.
  538. /// \returns A 128-bit integer vector containing the products of both operands.
  539. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
  540.                                                              __m128i __V2) {
         /* Written as a plain vector multiply rather than a builtin call.
            NOTE(review): __v4su is presumably the unsigned 32-bit lane type,
            so each product wraps to its low 32 bits -- confirm where __v4su
            is defined. */
  541.   return (__m128i)((__v4su)__V1 * (__v4su)__V2);
  542. }
  543.  
  544. /// Multiplies corresponding even-indexed elements of two 128-bit
  545. ///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
  546. ///    containing the products.
  547. ///
  548. /// \headerfile <x86intrin.h>
  549. ///
  550. /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
  551. ///
  552. /// \param __V1
  553. ///    A 128-bit vector of [4 x i32].
  554. /// \param __V2
  555. ///    A 128-bit vector of [4 x i32].
  556. /// \returns A 128-bit vector of [2 x i64] containing the products of both
  557. ///    operands.
  558. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
  559.                                                            __m128i __V2) {
         /* Both operands are viewed as __v4si for the builtin; the result is
            cast back to __m128i. */
  560.   return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
  561. }
  562.  
  563. /* SSE4 Floating Point Dot Product Instructions.  */
  564. /// Computes the dot product of the two 128-bit vectors of [4 x float]
  565. ///    and returns it in the elements of the 128-bit result vector of
  566. ///    [4 x float].
  567. ///
  568. ///    The immediate integer operand controls which input elements
  569. ///    will contribute to the dot product, and where the final results are
  570. ///    returned.
  571. ///
  572. /// \headerfile <x86intrin.h>
  573. ///
  574. /// \code
  575. /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
  576. /// \endcode
  577. ///
  578. /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
  579. ///
  580. /// \param X
  581. ///    A 128-bit vector of [4 x float].
  582. /// \param Y
  583. ///    A 128-bit vector of [4 x float].
  584. /// \param M
  585. ///    An immediate integer operand. Mask bits [7:4] determine which elements
  586. ///    of the input vectors are used, with bit [4] corresponding to the lowest
  587. ///    element and bit [7] corresponding to the highest element of each [4 x
  588. ///    float] vector. If a bit is set, the corresponding elements from the two
  589. ///    input vectors are used as an input for dot product; otherwise that input
  590. ///    is treated as zero. Bits [3:0] determine which elements of the result
  591. ///    will receive a copy of the final dot product, with bit [0] corresponding
  592. ///    to the lowest element and bit [3] corresponding to the highest element of
  593. ///    each [4 x float] vector. If a bit is set, the dot product is returned
  594. ///    in the corresponding element; otherwise that element is set to zero.
  595. /// \returns A 128-bit vector of [4 x float] containing the dot product.
  /* X and Y are cast to __m128 and then to __v4sf for the builtin; the control
     immediate M is passed through as written. */
  596. #define _mm_dp_ps(X, Y, M)                                                     \
  597.   ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
  598.  
  599. /// Computes the dot product of the two 128-bit vectors of [2 x double]
  600. ///    and returns it in the elements of the 128-bit result vector of
  601. ///    [2 x double].
  602. ///
  603. ///    The immediate integer operand controls which input
  604. ///    elements will contribute to the dot product, and where the final results
  605. ///    are returned.
  606. ///
  607. /// \headerfile <x86intrin.h>
  608. ///
  609. /// \code
  610. /// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
  611. /// \endcode
  612. ///
  613. /// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
  614. ///
  615. /// \param X
  616. ///    A 128-bit vector of [2 x double].
  617. /// \param Y
  618. ///    A 128-bit vector of [2 x double].
  619. /// \param M
  620. ///    An immediate integer operand. Mask bits [5:4] determine which elements
  621. ///    of the input vectors are used, with bit [4] corresponding to the lowest
  622. ///    element and bit [5] corresponding to the highest element of each [2 x
  623. ///    double] vector. If a bit is set, the corresponding elements from the two
  624. ///    input vectors are used as an input for dot product; otherwise that input
  625. ///    is treated as zero. Bits [1:0] determine which elements of the result
  626. ///    will receive a copy of the final dot product, with bit [0] corresponding
  627. ///    to the lowest element and bit [1] corresponding to the highest element of
  628. ///    each [2 x double] vector. If a bit is set, the dot product is returned in
  629. ///    the corresponding element; otherwise that element is set to zero.
  /// \returns A 128-bit vector of [2 x double] containing the dot product.
  /* X and Y are cast to __m128d and then to __v2df for the builtin; the
     control immediate M is passed through as written. */
  630. #define _mm_dp_pd(X, Y, M)                                                     \
  631.   ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
  632.                                 (M)))
  633.  
  634. /* SSE4 Streaming Load Hint Instruction.  */
  635. /// Loads integer values from a 128-bit aligned memory location to a
  636. ///    128-bit integer vector.
  637. ///
  638. /// \headerfile <x86intrin.h>
  639. ///
  640. /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
  641. ///
  642. /// \param __V
  643. ///    A pointer to a 128-bit aligned memory location that contains the integer
  644. ///    values.
  645. /// \returns A 128-bit integer vector containing the data stored at the
  646. ///    specified memory location.
  647. static __inline__ __m128i __DEFAULT_FN_ATTRS
  648. _mm_stream_load_si128(__m128i const *__V) {
         /* The pointer is reinterpreted as const __v2di * for the generic
            non-temporal load builtin. The 128-bit alignment of __V is a
            documented precondition; nothing here checks it. */
  649.   return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
  650. }
  651.  
  652. /* SSE4 Packed Integer Min/Max Instructions.  */
  653. /// Compares the corresponding signed elements of two 128-bit vectors of
  654. ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
  655. ///    of the two values.
  656. ///
  657. /// \headerfile <x86intrin.h>
  658. ///
  659. /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
  660. ///
  661. /// \param __V1
  662. ///    A 128-bit vector of [16 x i8].
  663. /// \param __V2
  664. ///    A 128-bit vector of [16 x i8].
  665. /// \returns A 128-bit vector of [16 x i8] containing the lesser values.
  666. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
  667.                                                           __m128i __V2) {
  668.   return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
  669. }
  670.  
  671. /// Compares the corresponding signed elements of two 128-bit vectors of
  672. ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
  673. ///    greater value of the two.
  674. ///
  675. /// \headerfile <x86intrin.h>
  676. ///
  677. /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
  678. ///
  679. /// \param __V1
  680. ///    A 128-bit vector of [16 x i8].
  681. /// \param __V2
  682. ///    A 128-bit vector of [16 x i8].
  683. /// \returns A 128-bit vector of [16 x i8] containing the greater values.
  684. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
  685.                                                           __m128i __V2) {
  686.   return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
  687. }
  688.  
  689. /// Compares the corresponding unsigned elements of two 128-bit vectors of
  690. ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
  691. ///    value of the two.
  692. ///
  693. /// \headerfile <x86intrin.h>
  694. ///
  695. /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
  696. ///
  697. /// \param __V1
  698. ///    A 128-bit vector of [8 x u16].
  699. /// \param __V2
  700. ///    A 128-bit vector of [8 x u16].
  701. /// \returns A 128-bit vector of [8 x u16] containing the lesser values.
  702. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
  703.                                                            __m128i __V2) {
  704.   return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
  705. }
  706.  
  707. /// Compares the corresponding unsigned elements of two 128-bit vectors of
  708. ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
  709. ///    greater value of the two.
  710. ///
  711. /// \headerfile <x86intrin.h>
  712. ///
  713. /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
  714. ///
  715. /// \param __V1
  716. ///    A 128-bit vector of [8 x u16].
  717. /// \param __V2
  718. ///    A 128-bit vector of [8 x u16].
  719. /// \returns A 128-bit vector of [8 x u16] containing the greater values.
  720. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
  721.                                                            __m128i __V2) {
  722.   return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
  723. }
  724.  
  725. /// Compares the corresponding signed elements of two 128-bit vectors of
  726. ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
  727. ///    value of the two.
  728. ///
  729. /// \headerfile <x86intrin.h>
  730. ///
  731. /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
  732. ///
  733. /// \param __V1
  734. ///    A 128-bit vector of [4 x i32].
  735. /// \param __V2
  736. ///    A 128-bit vector of [4 x i32].
  737. /// \returns A 128-bit vector of [4 x i32] containing the lesser values.
  738. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
  739.                                                            __m128i __V2) {
  740.   return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
  741. }
  742.  
  743. /// Compares the corresponding signed elements of two 128-bit vectors of
  744. ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
  745. ///    greater value of the two.
  746. ///
  747. /// \headerfile <x86intrin.h>
  748. ///
  749. /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
  750. ///
  751. /// \param __V1
  752. ///    A 128-bit vector of [4 x i32].
  753. /// \param __V2
  754. ///    A 128-bit vector of [4 x i32].
  755. /// \returns A 128-bit vector of [4 x i32] containing the greater values.
  756. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
  757.                                                            __m128i __V2) {
  758.   return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
  759. }
  760.  
  761. /// Compares the corresponding unsigned elements of two 128-bit vectors of
  762. ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
  763. ///    value of the two.
  764. ///
  765. /// \headerfile <x86intrin.h>
  766. ///
  767. /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
  768. ///
  769. /// \param __V1
  770. ///    A 128-bit vector of [4 x u32].
  771. /// \param __V2
  772. ///    A 128-bit vector of [4 x u32].
  773. /// \returns A 128-bit vector of [4 x u32] containing the lesser values.
  774. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
  775.                                                            __m128i __V2) {
  776.   return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
  777. }
  778.  
  779. /// Compares the corresponding unsigned elements of two 128-bit vectors of
  780. ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
  781. ///    greater value of the two.
  782. ///
  783. /// \headerfile <x86intrin.h>
  784. ///
  785. /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
  786. ///
  787. /// \param __V1
  788. ///    A 128-bit vector of [4 x u32].
  789. /// \param __V2
  790. ///    A 128-bit vector of [4 x u32].
  791. /// \returns A 128-bit vector of [4 x u32] containing the greater values.
  792. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
  793.                                                            __m128i __V2) {
  794.   return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
  795. }
  796.  
  797. /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
  798. /// Takes the first argument \a X and inserts an element from the second
  799. ///    argument \a Y as selected by the third argument \a N. That result then
  800. ///    has elements zeroed out also as selected by the third argument \a N. The
  801. ///    resulting 128-bit vector of [4 x float] is then returned.
  802. ///
  803. /// \headerfile <x86intrin.h>
  804. ///
  805. /// \code
  806. /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
  807. /// \endcode
  808. ///
  809. /// This intrinsic corresponds to the <c> VINSERTPS / INSERTPS </c> instruction.
  810. ///
  811. /// \param X
  812. ///    A 128-bit vector source operand of [4 x float]. With the exception of
  813. ///    those bits in the result copied from parameter \a Y and zeroed by bits
  814. ///    [3:0] of \a N, all bits from this parameter are copied to the result.
  815. /// \param Y
  816. ///    A 128-bit vector source operand of [4 x float]. One single-precision
  817. ///    floating-point element from this source, as determined by the immediate
  818. ///    parameter, is copied to the result.
  819. /// \param N
  820. ///    Specifies which bits from operand \a Y will be copied, which bits in the
  821. ///    result they will be copied to, and which bits in the result will be
  822. ///    cleared. The following assignments are made: \n
  823. ///    Bits [7:6] specify the bits to copy from operand \a Y: \n
  824. ///      00: Selects bits [31:0] from operand \a Y. \n
  825. ///      01: Selects bits [63:32] from operand \a Y. \n
  826. ///      10: Selects bits [95:64] from operand \a Y. \n
  827. ///      11: Selects bits [127:96] from operand \a Y. \n
  828. ///    Bits [5:4] specify the bits in the result to which the selected bits
  829. ///    from operand \a Y are copied: \n
  830. ///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
  831. ///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
  832. ///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
  833. ///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
  834. ///    Bits [3:0]: If any of these bits are set, the corresponding result
  835. ///    element is cleared.
  836. /// \returns A 128-bit vector of [4 x float] containing the copied
  837. ///    single-precision floating point elements from the operands.
  838. #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
  839.  
  840. /// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
  841. ///    returns it, using the immediate value parameter \a N as a selector.
  842. ///
  843. /// \headerfile <x86intrin.h>
  844. ///
  845. /// \code
  846. /// int _mm_extract_ps(__m128 X, const int N);
  847. /// \endcode
  848. ///
  849. /// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
  850. /// instruction.
  851. ///
  852. /// \param X
  853. ///    A 128-bit vector of [4 x float].
  854. /// \param N
  855. ///    An immediate value. Bits [1:0] determine which bits from the argument
  856. ///    \a X are extracted and returned: \n
  857. ///    00: Bits [31:0] of parameter \a X are returned. \n
  858. ///    01: Bits [63:32] of parameter \a X are returned. \n
  859. ///    10: Bits [95:64] of parameter \a X are returned. \n
  860. ///    11: Bits [127:96] of parameter \a X are returned.
  861. /// \returns A 32-bit integer containing the extracted 32 bits of float data.
  862. #define _mm_extract_ps(X, N)                                                   \
  863.   __builtin_bit_cast(                                                          \
  864.       int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
  865.  
  866. /* Miscellaneous insert and extract macros.  */
  867. /* Store element N of the [4 x float] vector X into D (statement macro).  */
  868. #define _MM_EXTRACT_FLOAT(D, X, N)                                             \
  869.   do {                                                                         \
  870.     (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
  871.   } while (0)
  872.  
  873. /* Build an _mm_insert_ps immediate: X is the source index (bits [7:6]), Y
  874.    the destination index (bits [5:4]) and Z the zeroing mask (bits [3:0]).  */
  875. #define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
  876.  
  877. /* Copy element N of X into element 0 of the result and zero elements 1-3.  */
  878. #define _MM_PICK_OUT_PS(X, N)                                                  \
  879.   _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
  880.  
  881. /* Insert int into packed integer array at index.  */
  882. /// Constructs a 128-bit vector of [16 x i8] by first making a copy of
  883. ///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
  884. ///    of an integer parameter \a I into the element selected by the immediate
  885. ///    value parameter \a N.
  886. ///
  887. /// \headerfile <x86intrin.h>
  888. ///
  889. /// \code
  890. /// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
  891. /// \endcode
  892. ///
  893. /// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
  894. ///
  895. /// \param X
  896. ///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
  897. ///    result and then one of the sixteen elements in the result vector is
  898. ///    replaced by the lower 8 bits of \a I.
  899. /// \param I
  900. ///    An integer. The lower 8 bits of this operand are written to the result
  901. ///    element selected by \a N.
  902. /// \param N
  903. ///    An immediate value. Bits [3:0] specify which 8-bit element of the
  904. ///    result receives the lower 8 bits of \a I. \n
  905. ///    0000: Bits [7:0] of the result are used for insertion. \n
  906. ///    0001: Bits [15:8] of the result are used for insertion. \n
  907. ///    0010: Bits [23:16] of the result are used for insertion. \n
  908. ///    0011: Bits [31:24] of the result are used for insertion. \n
  909. ///    0100: Bits [39:32] of the result are used for insertion. \n
  910. ///    0101: Bits [47:40] of the result are used for insertion. \n
  911. ///    0110: Bits [55:48] of the result are used for insertion. \n
  912. ///    0111: Bits [63:56] of the result are used for insertion. \n
  913. ///    1000: Bits [71:64] of the result are used for insertion. \n
  914. ///    1001: Bits [79:72] of the result are used for insertion. \n
  915. ///    1010: Bits [87:80] of the result are used for insertion. \n
  916. ///    1011: Bits [95:88] of the result are used for insertion. \n
  917. ///    1100: Bits [103:96] of the result are used for insertion. \n
  918. ///    1101: Bits [111:104] of the result are used for insertion. \n
  919. ///    1110: Bits [119:112] of the result are used for insertion. \n
  920. ///    1111: Bits [127:120] of the result are used for insertion.
  921. /// \returns A 128-bit integer vector containing the constructed values.
  922. #define _mm_insert_epi8(X, I, N)                                               \
  923.   ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I),      \
  924.                                          (int)(N)))
  925.  
  926. /// Constructs a 128-bit vector of [4 x i32] by first making a copy of
  927. ///    the 128-bit integer vector parameter, and then inserting the 32-bit
  928. ///    integer parameter \a I into the element selected by the immediate value
  929. ///    parameter \a N.
  930. ///
  931. /// \headerfile <x86intrin.h>
  932. ///
  933. /// \code
  934. /// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
  935. /// \endcode
  936. ///
  937. /// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
  938. ///
  939. /// \param X
  940. ///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
  941. ///    result and then one of the four elements in the result vector is
  942. ///    replaced by \a I.
  943. /// \param I
  944. ///    A 32-bit integer that is written to the result element selected by
  945. ///    \a N.
  946. /// \param N
  947. ///    An immediate value. Bits [1:0] specify which 32-bit element of the
  948. ///    result is replaced by the integer \a I. \n
  949. ///    00: Bits [31:0] of the result are used for insertion. \n
  950. ///    01: Bits [63:32] of the result are used for insertion. \n
  951. ///    10: Bits [95:64] of the result are used for insertion. \n
  952. ///    11: Bits [127:96] of the result are used for insertion.
  953. /// \returns A 128-bit integer vector containing the constructed values.
  954. #define _mm_insert_epi32(X, I, N)                                              \
  955.   ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I),        \
  956.                                         (int)(N)))
  957.  
  958. #ifdef __x86_64__
  959. /// Constructs a 128-bit vector of [2 x i64] by first making a copy of
  960. ///    the 128-bit integer vector parameter, and then inserting the 64-bit
  961. ///    integer parameter \a I, using the immediate value parameter \a N as an
  962. ///    insertion location selector.
  963. ///
  964. /// \headerfile <x86intrin.h>
  965. ///
  966. /// \code
  967. /// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
  968. /// \endcode
  969. ///
  970. /// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
  971. ///
  972. /// \param X
  973. ///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
  974. ///    result and then one of the two elements in the result vector is replaced
  975. ///    by \a I.
  976. /// \param I
  977. ///    A 64-bit integer that is written to the result element selected by
  978. ///    \a N.
  979. /// \param N
  980. ///    An immediate value. Bit [0] specifies which 64-bit element of the
  981. ///    result is replaced by the integer \a I. \n
  982. ///    0: Bits [63:0] of the result are used for insertion. \n
  983. ///    1: Bits [127:64] of the result are used for insertion.
  984. /// \returns A 128-bit integer vector containing the constructed values.
  985. #define _mm_insert_epi64(X, I, N)                                              \
  986.   ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I),  \
  987.                                         (int)(N)))
  988. #endif /* __x86_64__ */
  989.  
  990. /* Extract int from packed integer array at index.  This returns the element
  991.  * as a zero extended value, so it is unsigned.
  992.  */
  993. /// Extracts an 8-bit element from the 128-bit integer vector of
  994. ///    [16 x i8], using the immediate value parameter \a N as a selector.
  995. ///
  996. /// \headerfile <x86intrin.h>
  997. ///
  998. /// \code
  999. /// int _mm_extract_epi8(__m128i X, const int N);
  1000. /// \endcode
  1001. ///
  1002. /// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
  1003. ///
  1004. /// \param X
  1005. ///    A 128-bit integer vector.
  1006. /// \param N
  1007. ///    An immediate value. Bits [3:0] specify which 8-bit vector element from
  1008. ///    the argument \a X to extract and copy to the result. \n
  1009. ///    0000: Bits [7:0] of the parameter \a X are extracted. \n
  1010. ///    0001: Bits [15:8] of the parameter \a X are extracted. \n
  1011. ///    0010: Bits [23:16] of the parameter \a X are extracted. \n
  1012. ///    0011: Bits [31:24] of the parameter \a X are extracted. \n
  1013. ///    0100: Bits [39:32] of the parameter \a X are extracted. \n
  1014. ///    0101: Bits [47:40] of the parameter \a X are extracted. \n
  1015. ///    0110: Bits [55:48] of the parameter \a X are extracted. \n
  1016. ///    0111: Bits [63:56] of the parameter \a X are extracted. \n
  1017. ///    1000: Bits [71:64] of the parameter \a X are extracted. \n
  1018. ///    1001: Bits [79:72] of the parameter \a X are extracted. \n
  1019. ///    1010: Bits [87:80] of the parameter \a X are extracted. \n
  1020. ///    1011: Bits [95:88] of the parameter \a X are extracted. \n
  1021. ///    1100: Bits [103:96] of the parameter \a X are extracted. \n
  1022. ///    1101: Bits [111:104] of the parameter \a X are extracted. \n
  1023. ///    1110: Bits [119:112] of the parameter \a X are extracted. \n
  1024. ///    1111: Bits [127:120] of the parameter \a X are extracted.
  1025. /// \returns  An integer, whose lower 8 bits are selected from the
  1026. ///    128-bit integer vector parameter and the remaining bits are assigned
  1027. ///    zeros (the element is zero-extended, not sign-extended).
  1028. #define _mm_extract_epi8(X, N)                                                 \
  1029.   ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X),     \
  1030.                                                     (int)(N)))
  1031.  
  1032. /// Extracts a 32-bit element from the 128-bit integer vector of
  1033. ///    [4 x i32], using the immediate value parameter \a N as a selector.
  1034. ///
  1035. /// \headerfile <x86intrin.h>
  1036. ///
  1037. /// \code
  1038. /// int _mm_extract_epi32(__m128i X, const int N);
  1039. /// \endcode
  1040. ///
  1041. /// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
  1042. ///
  1043. /// \param X
  1044. ///    A 128-bit integer vector.
  1045. /// \param N
  1046. ///    An immediate value. Bits [1:0] specify which 32-bit vector element from
  1047. ///    the argument \a X to extract and copy to the result. \n
  1048. ///    00: Bits [31:0] of the parameter \a X are extracted. \n
  1049. ///    01: Bits [63:32] of the parameter \a X are extracted. \n
  1050. ///    10: Bits [95:64] of the parameter \a X are extracted. \n
  1051. ///    11: Bits [127:96] of the parameter \a X are extracted.
  1052. /// \returns  An integer containing the 32-bit element selected from the
  1053. ///    128-bit integer vector parameter.
  1054. #define _mm_extract_epi32(X, N)                                                \
  1055.   ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
  1056.  
  1057. /// Extracts a 64-bit element from the 128-bit integer vector of
  1058. ///    [2 x i64], using the immediate value parameter \a N as a selector.
  1059. ///
  1060. /// \headerfile <x86intrin.h>
  1061. ///
  1062. /// \code
  1063. /// long long _mm_extract_epi64(__m128i X, const int N);
  1064. /// \endcode
  1065. ///
  1066. /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
  1067. /// in 64-bit mode.
  1068. ///
  1069. /// \param X
  1070. ///    A 128-bit integer vector.
  1071. /// \param N
  1072. ///    An immediate value. Bit [0] specifies which 64-bit vector element from
  1073. ///    the argument \a X to return. \n
  1074. ///    0: Bits [63:0] are returned. \n
  1075. ///    1: Bits [127:64] are returned.
  1076. /// \returns  A 64-bit integer.
  1077. #define _mm_extract_epi64(X, N)                                                \
  1078.   ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
  1079.  
  1080. /* SSE4 128-bit Packed Integer Comparisons.  */
  1081. /// Tests whether the bits of \a __M selected by the mask \a __V are all
  1082. ///    zeros.
  1083. ///
  1084. /// \headerfile <x86intrin.h>
  1085. ///
  1086. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1087. ///
  1088. /// \param __M
  1089. ///    A 128-bit integer vector containing the bits to be tested.
  1090. /// \param __V
  1091. ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
  1092. /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
  1093. static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
  1094.                                                          __m128i __V) {
  1095.   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
  1096. }
  1097.  
  1098. /// Tests whether the bits of \a __M selected by the mask \a __V are all
  1099. ///    ones.
  1100. ///
  1101. /// \headerfile <x86intrin.h>
  1102. ///
  1103. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1104. ///
  1105. /// \param __M
  1106. ///    A 128-bit integer vector containing the bits to be tested.
  1107. /// \param __V
  1108. ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
  1109. /// \returns TRUE if the specified bits are all ones; FALSE otherwise.
  1110. static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
  1111.                                                          __m128i __V) {
  1112.   return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
  1113. }
  1114.  
  1115. /// Tests whether the bits of \a __M selected by the mask \a __V are
  1116. ///    neither all zeros nor all ones.
  1117. ///
  1118. /// \headerfile <x86intrin.h>
  1119. ///
  1120. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1121. ///
  1122. /// \param __M
  1123. ///    A 128-bit integer vector containing the bits to be tested.
  1124. /// \param __V
  1125. ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
  1126. /// \returns TRUE if the specified bits are neither all zeros nor all ones;
  1127. ///    FALSE otherwise.
  1128. static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
  1129.                                                            __m128i __V) {
  1130.   return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
  1131. }
  1132.  
  1133. /// Tests whether every bit of a 128-bit integer vector is set, by calling
  1134. ///    _mm_testc_si128 with an all-ones mask.
  1135. ///
  1136. /// \headerfile <x86intrin.h>
  1137. ///
  1138. /// \code
  1139. /// int _mm_test_all_ones(__m128i V);
  1140. /// \endcode
  1141. ///
  1142. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1143. ///
  1144. /// \param V
  1145. ///    A 128-bit integer vector containing the bits to be tested.
  1146. /// \returns TRUE if all 128 bits of \a V are set to 1; FALSE
  1147. ///    otherwise.
  1148. #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
  1149.  
  1150. /// Tests whether the bits of \a M selected by the mask \a V are neither
  1151. ///    all zeros nor all ones; expands to _mm_testnzc_si128.
  1152. ///
  1153. /// \headerfile <x86intrin.h>
  1154. ///
  1155. /// \code
  1156. /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
  1157. /// \endcode
  1158. ///
  1159. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1160. ///
  1161. /// \param M
  1162. ///    A 128-bit integer vector containing the bits to be tested.
  1163. /// \param V
  1164. ///    A 128-bit integer vector selecting which bits to test in operand \a M.
  1165. /// \returns TRUE if the specified bits are neither all zeros nor all ones;
  1166. ///    FALSE otherwise.
  1167. #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
  1168.  
  1169. /// Tests whether the bits of \a M selected by the mask \a V are all
  1170. ///    zeros; expands to _mm_testz_si128.
  1171. ///
  1172. /// \headerfile <x86intrin.h>
  1173. ///
  1174. /// \code
  1175. /// int _mm_test_all_zeros(__m128i M, __m128i V);
  1176. /// \endcode
  1177. ///
  1178. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1179. ///
  1180. /// \param M
  1181. ///    A 128-bit integer vector containing the bits to be tested.
  1182. /// \param V
  1183. ///    A 128-bit integer vector selecting which bits to test in operand \a M.
  1184. /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
  1185. #define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
  1186.  
  1187. /* SSE4 64-bit Packed Integer Comparisons.  */
  1188. /// Compares each of the corresponding 64-bit values of the 128-bit
  1189. ///    integer vectors for equality.
  1190. ///
  1191. /// \headerfile <x86intrin.h>
  1192. ///
  1193. /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
  1194. ///
  1195. /// \param __V1
  1196. ///    A 128-bit integer vector.
  1197. /// \param __V2
  1198. ///    A 128-bit integer vector.
  1199. /// \returns A 128-bit vector of [2 x i64]: all ones where equal, else zeros.
  1200. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
  1201.                                                              __m128i __V2) {
  1202.   return (__m128i)((__v2di)__V1 == (__v2di)__V2);
  1203. }
  1204.  
  1205. /* SSE4 Packed Integer Sign-Extension.  */
  1206. /// Sign-extends each of the lower eight 8-bit integer elements of a
  1207. ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
  1208. ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
  1209. ///    are ignored.
  1210. ///
  1211. /// \headerfile <x86intrin.h>
  1212. ///
  1213. /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
  1214. ///
  1215. /// \param __V
  1216. ///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
  1217. ///    sign-extended to 16-bit values.
  1218. /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
  1219. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
  1220.   /* This function always performs a signed extension, but __v16qi has char
  1221.      elements, which may be signed or unsigned, so use __v16qs instead. */
  1222.   return (__m128i) __builtin_convertvector(
  1223.       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
  1224.                               7),
  1225.       __v8hi);
  1226. }
  1227.  
  1228. /// Sign-extends each of the lower four 8-bit integer elements of a
  1229. ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
  1230. ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
  1231. ///    vector are ignored.
  1232. ///
  1233. /// \headerfile <x86intrin.h>
  1234. ///
  1235. /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
  1236. ///
  1237. /// \param __V
  1238. ///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
  1239. ///    sign-extended to 32-bit values.
  1240. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
  1241. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
  1242.   /* This function always performs a signed extension, but __v16qi has char
  1243.      elements, which may be signed or unsigned, so use __v16qs instead. */
  1244.   return (__m128i) __builtin_convertvector(
  1245.       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
  1246. }
  1247.  
  1248. /// Sign-extends each of the lower two 8-bit integer elements of a
  1249. ///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
  1250. ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
  1251. ///    vector are ignored.
  1252. ///
  1253. /// \headerfile <x86intrin.h>
  1254. ///
  1255. /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
  1256. ///
  1257. /// \param __V
  1258. ///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
  1259. ///    sign-extended to 64-bit values.
  1260. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
  1261. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
  1262.   /* This function always performs a signed extension, but __v16qi has char
  1263.      elements, which may be signed or unsigned, so use __v16qs instead. */
  1264.   return (__m128i) __builtin_convertvector(
  1265.       __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
  1266. }
  1267.  
  1268. /// Sign-extends each of the lower four 16-bit integer elements of a
  1269. ///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
  1270. ///    a 128-bit vector of [4 x i32]. The upper four elements of the input
  1271. ///    vector are ignored.
  1272. ///
  1273. /// \headerfile <x86intrin.h>
  1274. ///
  1275. /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
  1276. ///
  1277. /// \param __V
  1278. ///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
  1279. ///    sign-extended to 32-bit values.
  1280. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
  1281. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
  1282.   return (__m128i) __builtin_convertvector(
  1283.       __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
  1284. }
  1285.  
  1286. /// Sign-extends each of the lower two 16-bit integer elements of a
  1287. ///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
  1288. ///    a 128-bit vector of [2 x i64]. The upper six elements of the input
  1289. ///    vector are unused.
  1290. ///
  1291. /// \headerfile <x86intrin.h>
  1292. ///
  1293. /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
  1294. ///
  1295. /// \param __V
  1296. ///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
  1297. ///    sign-extended to 64-bit values.
  1298. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
  1299. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
          /* Select elements 0 and 1 of the __v8hi view and convert them to the two
             lanes of __v2di. Presumably __v8hi is a signed element type, which is
             what makes the widening a sign extension (typedef not shown here). */
  1300.   return (__m128i) __builtin_convertvector(
  1301.       __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
  1302. }
  1303.  
  1304. /// Sign-extends each of the lower two 32-bit integer elements of a
  1305. ///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
  1306. ///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
  1307. ///    are unused.
  1308. ///
  1309. /// \headerfile <x86intrin.h>
  1310. ///
  1311. /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
  1312. ///
  1313. /// \param __V
  1314. ///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
  1315. ///    sign-extended to 64-bit values.
  1316. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
  1317. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
          /* Two-step form shared by the extension intrinsics in this file: take
             elements 0 and 1 of the __v4si view, then convert them element-wise to
             __v2di. */
  1318.   return (__m128i) __builtin_convertvector(
  1319.       __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
  1320. }
  1321.  
  1322. /* SSE4 Packed Integer Zero-Extension.  */
  1323. /// Zero-extends each of the lower eight 8-bit integer elements of a
  1324. ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
  1325. ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
  1326. ///    are unused.
  1327. ///
  1328. /// \headerfile <x86intrin.h>
  1329. ///
  1330. /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
  1331. ///
  1332. /// \param __V
  1333. ///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
  1334. ///    zero-extended to 16-bit values.
  1335. /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
  1336. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
          /* The input is viewed as __v16qu (rather than the __v16qs used by the
             signed variant) so that converting elements 0-7 to __v8hi widens them
             without sign extension -- assuming __v16qu has unsigned elements, as
             its name suggests; the typedef is not visible here. */
  1337.   return (__m128i) __builtin_convertvector(
  1338.       __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
  1339.                               7),
  1340.       __v8hi);
  1341. }
  1342.  
  1343. /// Zero-extends each of the lower four 8-bit integer elements of a
  1344. ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
  1345. ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
  1346. ///    vector are unused.
  1347. ///
  1348. /// \headerfile <x86intrin.h>
  1349. ///
  1350. /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
  1351. ///
  1352. /// \param __V
  1353. ///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
  1354. ///    zero-extended to 32-bit values.
  1355. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
  1356. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
          /* Elements 0-3 of the __v16qu view are extracted and converted to the
             four lanes of __v4si. Zero extension presumably follows from __v16qu
             being an unsigned element type (typedef not shown here). */
  1357.   return (__m128i) __builtin_convertvector(
  1358.       __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
  1359. }
  1360.  
  1361. /// Zero-extends each of the lower two 8-bit integer elements of a
  1362. ///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
  1363. ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
  1364. ///    vector are unused.
  1365. ///
  1366. /// \headerfile <x86intrin.h>
  1367. ///
  1368. /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
  1369. ///
  1370. /// \param __V
  1371. ///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
  1372. ///    zero-extended to 64-bit values.
  1373. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
  1374. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
          /* Only elements 0 and 1 of the __v16qu view survive the shuffle; each is
             then converted to one lane of __v2di. */
  1375.   return (__m128i) __builtin_convertvector(
  1376.       __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
  1377. }
  1378.  
  1379. /// Zero-extends each of the lower four 16-bit integer elements of a
  1380. ///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
  1381. ///    a 128-bit vector of [4 x i32]. The upper four elements of the input
  1382. ///    vector are unused.
  1383. ///
  1384. /// \headerfile <x86intrin.h>
  1385. ///
  1386. /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
  1387. ///
  1388. /// \param __V
  1389. ///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
  1390. ///    zero-extended to 32-bit values.
  1391. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
  1392. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
          /* Uses the __v8hu view (the signed counterpart uses __v8hi): elements 0-3
             are extracted and converted to __v4si. The unsigned source element
             type is presumably what yields zero extension; typedef not shown. */
  1393.   return (__m128i) __builtin_convertvector(
  1394.       __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
  1395. }
  1396.  
  1397. /// Zero-extends each of the lower two 16-bit integer elements of a
  1398. ///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
  1399. ///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
  1400. ///    are unused.
  1401. ///
  1402. /// \headerfile <x86intrin.h>
  1403. ///
  1404. /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
  1405. ///
  1406. /// \param __V
  1407. ///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
  1408. ///    zero-extended to 64-bit values.
  1409. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
  1410. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
          /* Elements 0 and 1 of the __v8hu view are kept and converted to the two
             lanes of __v2di. */
  1411.   return (__m128i) __builtin_convertvector(
  1412.       __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
  1413. }
  1414.  
  1415. /// Zero-extends each of the lower two 32-bit integer elements of a
  1416. ///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
  1417. ///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
  1418. ///    are unused.
  1419. ///
  1420. /// \headerfile <x86intrin.h>
  1421. ///
  1422. /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
  1423. ///
  1424. /// \param __V
  1425. ///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
  1426. ///    zero-extended to 64-bit values.
  1427. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
  1428. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
          /* Elements 0 and 1 of the __v4su view are kept and converted to __v2di.
             The source view is __v4su, not __v4si as in _mm_cvtepi32_epi64; that
             difference is presumably what turns the widening into a zero
             extension. */
  1429.   return (__m128i) __builtin_convertvector(
  1430.       __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
  1431. }
  1432.  
  1433. /* SSE4 Pack with Unsigned Saturation.  */
  1434. /// Converts 32-bit signed integers from both 128-bit integer vector
  1435. ///    operands into 16-bit unsigned integers, and returns the packed result.
  1436. ///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
  1437. ///    0x0000 are saturated to 0x0000.
  1438. ///
  1439. /// \headerfile <x86intrin.h>
  1440. ///
  1441. /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
  1442. ///
  1443. /// \param __V1
  1444. ///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
  1445. ///    signed integer and is converted to a 16-bit unsigned integer with
  1446. ///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
  1447. ///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
  1448. ///    are written to the lower 64 bits of the result.
  1449. /// \param __V2
  1450. ///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
  1451. ///    signed integer and is converted to a 16-bit unsigned integer with
  1452. ///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
  1453. ///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
  1454. ///    are written to the higher 64 bits of the result.
  1455. /// \returns A 128-bit vector of [8 x i16] containing the converted values.
  1456. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
  1457.                                                               __m128i __V2) {
          /* Thin wrapper: both operands are viewed as __v4si and handed to the
             PACKUSDW builtin; the builtin's result is cast back to __m128i. No
             other processing happens here. */
  1458.   return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
  1459. }
  1460.  
  1461. /* SSE4 Multiple Packed Sums of Absolute Difference.  */
  1462. /// Computes sums of absolute differences between groups of four unsigned
  1463. ///    8-bit integers taken from the two source operands. Eight sums are
  1464. ///    produced; the bit fields of the immediate operand select which groups
  1465. ///    of elements are compared.
  1466. ///
  1467. /// \headerfile <x86intrin.h>
  1468. ///
  1469. /// \code
  1470. /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
  1471. /// \endcode
  1472. ///
  1473. /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
  1474. ///
  1475. /// \param X
  1476. ///    A 128-bit vector of [16 x i8].
  1477. /// \param Y
  1478. ///    A 128-bit vector of [16 x i8].
  1479. /// \param M
  1480. ///    An 8-bit immediate operand specifying how the absolute differences are to
  1481. ///    be calculated, according to the following algorithm:
  1482. ///    \code
  1483. ///    // M2 represents bit 2 of the immediate operand
  1484. ///    // M10 represents bits [1:0] of the immediate operand
  1485. ///    i = M2 * 4;
  1486. ///    j = M10 * 4;
  1487. ///    for (k = 0; k < 8; k = k + 1) {
  1488. ///      d0 = abs(X[i + k + 0] - Y[j + 0]);
  1489. ///      d1 = abs(X[i + k + 1] - Y[j + 1]);
  1490. ///      d2 = abs(X[i + k + 2] - Y[j + 2]);
  1491. ///      d3 = abs(X[i + k + 3] - Y[j + 3]);
  1492. ///      r[k] = d0 + d1 + d2 + d3;
  1493. ///    }
  1494. ///    \endcode
  1495. /// \returns A 128-bit integer vector containing the eight sums r[0] through
  1496. ///    r[7] of absolute differences, computed as described for \a M.
        /* X and Y are first converted to __m128i and then viewed as __v16qi; M is
           passed to the builtin exactly as written (no cast is applied to it). */
  1497. #define _mm_mpsadbw_epu8(X, Y, M)                                              \
  1498.   ((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X),                   \
  1499.                                       (__v16qi)(__m128i)(Y), (M)))
  1500.  
  1501. /// Finds the minimum unsigned 16-bit element in the input 128-bit
  1502. ///    vector of [8 x u16] and returns it along with its index.
  1503. ///
  1504. /// \headerfile <x86intrin.h>
  1505. ///
  1506. /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
  1507. /// instruction.
  1508. ///
  1509. /// \param __V
  1510. ///    A 128-bit vector of [8 x u16].
  1511. /// \returns A 128-bit value where bits [15:0] contain the minimum value found
  1512. ///    in parameter \a __V, bits [18:16] contain the index of the minimum value
  1513. ///    and the remaining bits are set to 0.
  1514. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
          /* Thin wrapper around the PHMINPOSUW builtin. The argument is passed as
             __v8hi although the elements are documented as unsigned; presumably
             the cast only changes the static type and the builtin itself does
             the unsigned comparison -- confirm against the builtin's signature. */
  1515.   return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
  1516. }
  1517.  
  1518. /* Handle the sse4.2 definitions here. */
  1519.  
  1520. /* These definitions are normally in nmmintrin.h, but gcc puts them in here
  1521.    so we'll do the same.  */
  1522.  
  1523. #undef __DEFAULT_FN_ATTRS
        /* Redefine the default function attributes with the "sse4.2" target for
           the definitions that follow. Unlike the SSE4.1 definition at the top of
           this header, this one does not specify __min_vector_width__(128). */
  1524. #define __DEFAULT_FN_ATTRS                                                     \
  1525.   __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
  1526.  
  1527. /* Source data format: bits [1:0] of the immediate operand.  */
  1528. #define _SIDD_UBYTE_OPS 0x00
  1529. #define _SIDD_UWORD_OPS 0x01
  1530. #define _SIDD_SBYTE_OPS 0x02
  1531. #define _SIDD_SWORD_OPS 0x03
  1532.  
  1533. /* Comparison type and aggregation method: bits [3:2].  */
  1534. #define _SIDD_CMP_EQUAL_ANY 0x00
  1535. #define _SIDD_CMP_RANGES 0x04
  1536. #define _SIDD_CMP_EQUAL_EACH 0x08
  1537. #define _SIDD_CMP_EQUAL_ORDERED 0x0c
  1538.  
  1539. /* Polarity of the operation (optional negation of the bit mask): bits [5:4].  */
  1540. #define _SIDD_POSITIVE_POLARITY 0x00
  1541. #define _SIDD_NEGATIVE_POLARITY 0x10
  1542. #define _SIDD_MASKED_POSITIVE_POLARITY 0x20
  1543. #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
  1544.  
  1545. /* Used in _mm_cmpXstri(): bit [6] selects which set bit's index is returned.  */
  1546. #define _SIDD_LEAST_SIGNIFICANT 0x00
  1547. #define _SIDD_MOST_SIGNIFICANT 0x40
  1548.  
  1549. /* Used in _mm_cmpXstrm(): bit [6] selects the format of the returned mask.  */
  1550. #define _SIDD_BIT_MASK 0x00
  1551. #define _SIDD_UNIT_MASK 0x40
  1552.  
  1553. /* SSE4.2 Packed Comparison Intrinsics.  */
  1554. /// Uses the immediate operand \a M to perform a comparison of string
  1555. ///    data with implicitly defined lengths that is contained in source operands
  1556. ///    \a A and \a B. Returns a 128-bit integer vector representing the result
  1557. ///    mask of the comparison.
  1558. ///
  1559. /// \headerfile <x86intrin.h>
  1560. ///
  1561. /// \code
  1562. /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
  1563. /// \endcode
  1564. ///
  1565. /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
  1566. /// instruction.
  1567. ///
  1568. /// \param A
  1569. ///    A 128-bit integer vector containing one of the source operands to be
  1570. ///    compared.
  1571. /// \param B
  1572. ///    A 128-bit integer vector containing one of the source operands to be
  1573. ///    compared.
  1574. /// \param M
  1575. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  1576. ///    words, the type of comparison to perform, and the format of the return
  1577. ///    value. \n
  1578. ///    Bits [1:0]: Determine source data format. \n
  1579. ///      00: 16 unsigned bytes \n
  1580. ///      01: 8 unsigned words \n
  1581. ///      10: 16 signed bytes \n
  1582. ///      11: 8 signed words \n
  1583. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  1584. ///      00: Subset: Each character in \a B is compared for equality with all
  1585. ///          the characters in \a A. \n
  1586. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  1587. ///          basis is greater than or equal for even-indexed elements in \a A,
  1588. ///          and less than or equal for odd-indexed elements in \a A. \n
  1589. ///      10: Match: Compare each pair of corresponding characters in \a A and
  1590. ///          \a B for equality. \n
  1591. ///      11: Substring: Search \a B for substring matches of \a A. \n
  1592. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  1593. ///                mask of the comparison results. \n
  1594. ///      00: No effect. \n
  1595. ///      01: Negate the bit mask. \n
  1596. ///      10: No effect. \n
  1597. ///      11: Negate the bit mask only for bits with an index less than or equal
  1598. ///          to the size of \a A or \a B. \n
  1599. ///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
  1600. ///             bytes. \n
  1601. ///      0: The result is zero-extended to 16 bytes. \n
  1602. ///      1: The result is expanded to 16 bytes (this expansion is performed by
  1603. ///         repeating each bit 8 or 16 times).
  1604. /// \returns A 128-bit integer vector representing the result mask of
  1605. ///    the comparison.
        /* A and B are first converted to __m128i and then viewed as __v16qi; M is
           cast to int before it is handed to the builtin. */
  1606. #define _mm_cmpistrm(A, B, M)                                                  \
  1607.   ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A),                 \
  1608.                                         (__v16qi)(__m128i)(B), (int)(M)))
  1609.  
  1610. /// Uses the immediate operand \a M to perform a comparison of string
  1611. ///    data with implicitly defined lengths that is contained in source operands
  1612. ///    \a A and \a B. Returns an integer representing the result index of the
  1613. ///    comparison.
  1614. ///
  1615. /// \headerfile <x86intrin.h>
  1616. ///
  1617. /// \code
  1618. /// int _mm_cmpistri(__m128i A, __m128i B, const int M);
  1619. /// \endcode
  1620. ///
  1621. /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  1622. /// instruction.
  1623. ///
  1624. /// \param A
  1625. ///    A 128-bit integer vector containing one of the source operands to be
  1626. ///    compared.
  1627. /// \param B
  1628. ///    A 128-bit integer vector containing one of the source operands to be
  1629. ///    compared.
  1630. /// \param M
  1631. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  1632. ///    words, the type of comparison to perform, and the format of the return
  1633. ///    value. \n
  1634. ///    Bits [1:0]: Determine source data format. \n
  1635. ///      00: 16 unsigned bytes \n
  1636. ///      01: 8 unsigned words \n
  1637. ///      10: 16 signed bytes \n
  1638. ///      11: 8 signed words \n
  1639. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  1640. ///      00: Subset: Each character in \a B is compared for equality with all
  1641. ///          the characters in \a A. \n
  1642. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  1643. ///          basis is greater than or equal for even-indexed elements in \a A,
  1644. ///          and less than or equal for odd-indexed elements in \a A. \n
  1645. ///      10: Match: Compare each pair of corresponding characters in \a A and
  1646. ///          \a B for equality. \n
  1647. ///      11: Substring: Search \a B for substring matches of \a A. \n
  1648. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  1649. ///                mask of the comparison results. \n
  1650. ///      00: No effect. \n
  1651. ///      01: Negate the bit mask. \n
  1652. ///      10: No effect. \n
  1653. ///      11: Negate the bit mask only for bits with an index less than or equal
  1654. ///          to the size of \a A or \a B. \n
  1655. ///    Bit [6]: Determines whether the index of the lowest set bit or the
  1656. ///             highest set bit is returned. \n
  1657. ///      0: The index of the least significant set bit. \n
  1658. ///      1: The index of the most significant set bit. \n
  1659. /// \returns An integer representing the result index of the comparison.
        /* Same argument handling as _mm_cmpistrm (vector casts on A and B, int
           cast on M); only the builtin and the int result type differ. */
  1660. #define _mm_cmpistri(A, B, M)                                                  \
  1661.   ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A),                     \
  1662.                                     (__v16qi)(__m128i)(B), (int)(M)))
  1663.  
  1664. /// Uses the immediate operand \a M to perform a comparison of string
  1665. ///    data with explicitly defined lengths that is contained in source operands
  1666. ///    \a A and \a B. Returns a 128-bit integer vector representing the result
  1667. ///    mask of the comparison.
  1668. ///
  1669. /// \headerfile <x86intrin.h>
  1670. ///
  1671. /// \code
  1672. /// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
  1673. /// \endcode
  1674. ///
  1675. /// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
  1676. /// instruction.
  1677. ///
  1678. /// \param A
  1679. ///    A 128-bit integer vector containing one of the source operands to be
  1680. ///    compared.
  1681. /// \param LA
  1682. ///    An integer that specifies the length of the string in \a A.
  1683. /// \param B
  1684. ///    A 128-bit integer vector containing one of the source operands to be
  1685. ///    compared.
  1686. /// \param LB
  1687. ///    An integer that specifies the length of the string in \a B.
  1688. /// \param M
  1689. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  1690. ///    words, the type of comparison to perform, and the format of the return
  1691. ///    value. \n
  1692. ///    Bits [1:0]: Determine source data format. \n
  1693. ///      00: 16 unsigned bytes \n
  1694. ///      01: 8 unsigned words \n
  1695. ///      10: 16 signed bytes \n
  1696. ///      11: 8 signed words \n
  1697. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  1698. ///      00: Subset: Each character in \a B is compared for equality with all
  1699. ///          the characters in \a A. \n
  1700. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  1701. ///          basis is greater than or equal for even-indexed elements in \a A,
  1702. ///          and less than or equal for odd-indexed elements in \a A. \n
  1703. ///      10: Match: Compare each pair of corresponding characters in \a A and
  1704. ///          \a B for equality. \n
  1705. ///      11: Substring: Search \a B for substring matches of \a A. \n
  1706. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  1707. ///                mask of the comparison results. \n
  1708. ///      00: No effect. \n
  1709. ///      01: Negate the bit mask. \n
  1710. ///      10: No effect. \n
  1711. ///      11: Negate the bit mask only for bits with an index less than or equal
  1712. ///          to the size of \a A or \a B. \n
  1713. ///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
  1714. ///             bytes. \n
  1715. ///      0: The result is zero-extended to 16 bytes. \n
  1716. ///      1: The result is expanded to 16 bytes (this expansion is performed by
  1717. ///         repeating each bit 8 or 16 times). \n
  1718. /// \returns A 128-bit integer vector representing the result mask of
  1719. ///    the comparison.
        /* Argument order passed to the builtin is A, LA, B, LB, M: each explicit
           length directly follows its vector. LA, LB and M are all cast to int. */
  1720. #define _mm_cmpestrm(A, LA, B, LB, M)                                          \
  1721.   ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA),      \
  1722.                                         (__v16qi)(__m128i)(B), (int)(LB),      \
  1723.                                         (int)(M)))
  1724.  
  1725. /// Uses the immediate operand \a M to perform a comparison of string
  1726. ///    data with explicitly defined lengths that is contained in source operands
  1727. ///    \a A and \a B. Returns an integer representing the result index of the
  1728. ///    comparison.
  1729. ///
  1730. /// \headerfile <x86intrin.h>
  1731. ///
  1732. /// \code
  1733. /// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
  1734. /// \endcode
  1735. ///
  1736. /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  1737. /// instruction.
  1738. ///
  1739. /// \param A
  1740. ///    A 128-bit integer vector containing one of the source operands to be
  1741. ///    compared.
  1742. /// \param LA
  1743. ///    An integer that specifies the length of the string in \a A.
  1744. /// \param B
  1745. ///    A 128-bit integer vector containing one of the source operands to be
  1746. ///    compared.
  1747. /// \param LB
  1748. ///    An integer that specifies the length of the string in \a B.
  1749. /// \param M
  1750. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  1751. ///    words, the type of comparison to perform, and the format of the return
  1752. ///    value. \n
  1753. ///    Bits [1:0]: Determine source data format. \n
  1754. ///      00: 16 unsigned bytes \n
  1755. ///      01: 8 unsigned words \n
  1756. ///      10: 16 signed bytes \n
  1757. ///      11: 8 signed words \n
  1758. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  1759. ///      00: Subset: Each character in \a B is compared for equality with all
  1760. ///          the characters in \a A. \n
  1761. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  1762. ///          basis is greater than or equal for even-indexed elements in \a A,
  1763. ///          and less than or equal for odd-indexed elements in \a A. \n
  1764. ///      10: Match: Compare each pair of corresponding characters in \a A and
  1765. ///          \a B for equality. \n
  1766. ///      11: Substring: Search \a B for substring matches of \a A. \n
  1767. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  1768. ///                mask of the comparison results. \n
  1769. ///      00: No effect. \n
  1770. ///      01: Negate the bit mask. \n
  1771. ///      10: No effect. \n
  1772. ///      11: Negate the bit mask only for bits with an index less than or equal
  1773. ///          to the size of \a A or \a B. \n
  1774. ///    Bit [6]: Determines whether the index of the lowest set bit or the
  1775. ///             highest set bit is returned. \n
  1776. ///      0: The index of the least significant set bit. \n
  1777. ///      1: The index of the most significant set bit. \n
  1778. /// \returns An integer representing the result index of the comparison.
        /* Argument order passed to the builtin is A, LA, B, LB, M; the lengths and
           the immediate are cast to int and the builtin's result is cast to int. */
  1779. #define _mm_cmpestri(A, LA, B, LB, M)                                          \
  1780.   ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA),          \
  1781.                                     (__v16qi)(__m128i)(B), (int)(LB),          \
  1782.                                     (int)(M)))
  1783.  
  1784. /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
  1785. /// Uses the immediate operand \a M to perform a comparison of string
  1786. ///    data with implicitly defined lengths that is contained in source operands
  1787. ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
  1788. ///    string in \a B is the maximum, otherwise, returns 0.
  1789. ///
  1790. /// \headerfile <x86intrin.h>
  1791. ///
  1792. /// \code
  1793. /// int _mm_cmpistra(__m128i A, __m128i B, const int M);
  1794. /// \endcode
  1795. ///
  1796. /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  1797. /// instruction.
  1798. ///
  1799. /// \param A
  1800. ///    A 128-bit integer vector containing one of the source operands to be
  1801. ///    compared.
  1802. /// \param B
  1803. ///    A 128-bit integer vector containing one of the source operands to be
  1804. ///    compared.
  1805. /// \param M
  1806. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  1807. ///    words and the type of comparison to perform. \n
  1808. ///    Bits [1:0]: Determine source data format. \n
  1809. ///      00: 16 unsigned bytes \n
  1810. ///      01: 8 unsigned words \n
  1811. ///      10: 16 signed bytes \n
  1812. ///      11: 8 signed words \n
  1813. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  1814. ///      00: Subset: Each character in \a B is compared for equality with all
  1815. ///          the characters in \a A. \n
  1816. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  1817. ///          basis is greater than or equal for even-indexed elements in \a A,
  1818. ///          and less than or equal for odd-indexed elements in \a A. \n
  1819. ///      10: Match: Compare each pair of corresponding characters in \a A and
  1820. ///          \a B for equality. \n
  1821. ///      11: Substring: Search \a B for substring matches of \a A. \n
  1822. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  1823. ///                mask of the comparison results. \n
  1824. ///      00: No effect. \n
  1825. ///      01: Negate the bit mask. \n
  1826. ///      10: No effect. \n
  1827. ///      11: Negate the bit mask only for bits with an index less than or equal
  1828. ///          to the size of \a A or \a B. \n
  1829. /// \returns 1 if the bit mask is zero and the length of the string in
  1830. ///    \a B is the maximum; otherwise, 0.
        /* Implemented with the dedicated __builtin_ia32_pcmpistria128 builtin, not
           with the pcmpistri128 builtin used by _mm_cmpistri. */
  1831. #define _mm_cmpistra(A, B, M)                                                  \
  1832.   ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A),                    \
  1833.                                      (__v16qi)(__m128i)(B), (int)(M)))
  1834.  
  1835. /// Uses the immediate operand \a M to perform a comparison of string
  1836. ///    data with implicitly defined lengths that is contained in source operands
  1837. ///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
  1838. ///    0.
  1839. ///
  1840. /// \headerfile <x86intrin.h>
  1841. ///
  1842. /// \code
  1843. /// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
  1844. /// \endcode
  1845. ///
  1846. /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  1847. /// instruction.
  1848. ///
  1849. /// \param A
  1850. ///    A 128-bit integer vector containing one of the source operands to be
  1851. ///    compared.
  1852. /// \param B
  1853. ///    A 128-bit integer vector containing one of the source operands to be
  1854. ///    compared.
  1855. /// \param M
  1856. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  1857. ///    words and the type of comparison to perform. \n
  1858. ///    Bits [1:0]: Determine source data format. \n
  1859. ///      00: 16 unsigned bytes \n
  1860. ///      01: 8 unsigned words \n
  1861. ///      10: 16 signed bytes \n
  1862. ///      11: 8 signed words \n
  1863. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  1864. ///      00: Subset: Each character in \a B is compared for equality with all
  1865. ///          the characters in \a A. \n
  1866. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  1867. ///          basis is greater than or equal for even-indexed elements in \a A,
  1868. ///          and less than or equal for odd-indexed elements in \a A. \n
  1869. ///      10: Match: Compare each pair of corresponding characters in \a A and
  1870. ///          \a B for equality. \n
  1871. ///      11: Substring: Search \a B for substring matches of \a A. \n
  1872. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  1873. ///                mask of the comparison results. \n
  1874. ///      00: No effect. \n
  1875. ///      01: Negate the bit mask. \n
  1876. ///      10: No effect. \n
  1877. ///      11: Negate the bit mask only for bits with an index less than or equal
  1878. ///          to the size of \a A or \a B.
  1879. /// \returns 1 if the bit mask is non-zero; otherwise, 0.
        /* Implemented with the dedicated __builtin_ia32_pcmpistric128 builtin; A
           and B are viewed as __v16qi and M is cast to int. */
  1880. #define _mm_cmpistrc(A, B, M)                                                  \
  1881.   ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A),                    \
  1882.                                      (__v16qi)(__m128i)(B), (int)(M)))
  1883.  
  1884. /// Uses the immediate operand \a M to perform a comparison of string
  1885. ///    data with implicitly defined lengths that is contained in source operands
  1886. ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
  1887. ///
  1888. /// \headerfile <x86intrin.h>
  1889. ///
  1890. /// \code
  1891. /// int _mm_cmpistro(__m128i A, __m128i B, const int M);
  1892. /// \endcode
  1893. ///
  1894. /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  1895. /// instruction.
  1896. ///
  1897. /// \param A
  1898. ///    A 128-bit integer vector containing one of the source operands to be
  1899. ///    compared.
  1900. /// \param B
  1901. ///    A 128-bit integer vector containing one of the source operands to be
  1902. ///    compared.
  1903. /// \param M
  1904. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  1905. ///    words and the type of comparison to perform. \n
  1906. ///    Bits [1:0]: Determine source data format. \n
  1907. ///      00: 16 unsigned bytes \n
  1908. ///      01: 8 unsigned words \n
  1909. ///      10: 16 signed bytes \n
  1910. ///      11: 8 signed words \n
  1911. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  1912. ///      00: Subset: Each character in \a B is compared for equality with all
  1913. ///          the characters in \a A. \n
  1914. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  1915. ///          basis is greater than or equal for even-indexed elements in \a A,
  1916. ///          and less than or equal for odd-indexed elements in \a A. \n
  1917. ///      10: Match: Compare each pair of corresponding characters in \a A and
  1918. ///          \a B for equality. \n
  1919. ///      11: Substring: Search \a B for substring matches of \a A. \n
  1920. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  1921. ///                mask of the comparison results. \n
  1922. ///      00: No effect. \n
  1923. ///      01: Negate the bit mask. \n
  1924. ///      10: No effect. \n
  1925. ///      11: Negate the bit mask only for bits with an index less than or equal
  1926. ///          to the size of \a A or \a B. \n
  1927. /// \returns Bit 0 of the resulting bit mask.
  1928. #define _mm_cmpistro(A, B, M)                                                  \
  1929.   ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A),                    \
  1930.                                      (__v16qi)(__m128i)(B), (int)(M)))
  1931.  
  1932. /// Uses the immediate operand \a M to perform a comparison of string
  1933. ///    data with implicitly defined lengths that is contained in source operands
  1934. ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
  1935. ///    the maximum, otherwise, returns 0.
  1936. ///
  1937. /// \headerfile <x86intrin.h>
  1938. ///
  1939. /// \code
  1940. /// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
  1941. /// \endcode
  1942. ///
  1943. /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  1944. /// instruction.
  1945. ///
  1946. /// \param A
  1947. ///    A 128-bit integer vector containing one of the source operands to be
  1948. ///    compared.
  1949. /// \param B
  1950. ///    A 128-bit integer vector containing one of the source operands to be
  1951. ///    compared.
  1952. /// \param M
  1953. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  1954. ///    words and the type of comparison to perform. \n
  1955. ///    Bits [1:0]: Determine source data format. \n
  1956. ///      00: 16 unsigned bytes \n
  1957. ///      01: 8 unsigned words \n
  1958. ///      10: 16 signed bytes \n
  1959. ///      11: 8 signed words \n
  1960. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  1961. ///      00: Subset: Each character in \a B is compared for equality with all
  1962. ///          the characters in \a A. \n
  1963. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  1964. ///          basis is greater than or equal for even-indexed elements in \a A,
  1965. ///          and less than or equal for odd-indexed elements in \a A. \n
  1966. ///      10: Match: Compare each pair of corresponding characters in \a A and
  1967. ///          \a B for equality. \n
  1968. ///      11: Substring: Search \a B for substring matches of \a A. \n
  1969. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  1970. ///                mask of the comparison results. \n
  1971. ///      00: No effect. \n
  1972. ///      01: Negate the bit mask. \n
  1973. ///      10: No effect. \n
  1974. ///      11: Negate the bit mask only for bits with an index less than or equal
  1975. ///          to the size of \a A or \a B. \n
  1976. /// \returns 1 if the length of the string in \a A is less than the maximum;
  1977. ///    otherwise, 0.
  1978. #define _mm_cmpistrs(A, B, M)                                                  \
  1979.   ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A),                    \
  1980.                                      (__v16qi)(__m128i)(B), (int)(M)))
  1981.  
  1982. /// Uses the immediate operand \a M to perform a comparison of string
  1983. ///    data with implicitly defined lengths that is contained in source operands
  1984. ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
  1985. ///    the maximum, otherwise, returns 0.
  1986. ///
  1987. /// \headerfile <x86intrin.h>
  1988. ///
  1989. /// \code
  1990. /// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
  1991. /// \endcode
  1992. ///
  1993. /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  1994. /// instruction.
  1995. ///
  1996. /// \param A
  1997. ///    A 128-bit integer vector containing one of the source operands to be
  1998. ///    compared.
  1999. /// \param B
  2000. ///    A 128-bit integer vector containing one of the source operands to be
  2001. ///    compared.
  2002. /// \param M
  2003. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  2004. ///    words and the type of comparison to perform. \n
  2005. ///    Bits [1:0]: Determine source data format. \n
  2006. ///      00: 16 unsigned bytes \n
  2007. ///      01: 8 unsigned words \n
  2008. ///      10: 16 signed bytes \n
  2009. ///      11: 8 signed words \n
  2010. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  2011. ///      00: Subset: Each character in \a B is compared for equality with all
  2012. ///          the characters in \a A. \n
  2013. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  2014. ///          basis is greater than or equal for even-indexed elements in \a A,
  2015. ///          and less than or equal for odd-indexed elements in \a A. \n
  2016. ///      10: Match: Compare each pair of corresponding characters in \a A and
  2017. ///          \a B for equality. \n
  2018. ///      11: Substring: Search \a B for substring matches of \a A. \n
  2019. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  2020. ///                mask of the comparison results. \n
  2021. ///      00: No effect. \n
  2022. ///      01: Negate the bit mask. \n
  2023. ///      10: No effect. \n
  2024. ///      11: Negate the bit mask only for bits with an index less than or equal
  2025. ///          to the size of \a A or \a B.
  2026. /// \returns 1 if the length of the string in \a B is less than the maximum;
  2027. ///    otherwise, 0.
  2028. #define _mm_cmpistrz(A, B, M)                                                  \
  2029.   ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A),                    \
  2030.                                      (__v16qi)(__m128i)(B), (int)(M)))
  2031.  
  2032. /// Uses the immediate operand \a M to perform a comparison of string
  2033. ///    data with explicitly defined lengths that is contained in source operands
  2034. ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
  2035. ///    string in \a B is the maximum, otherwise, returns 0.
  2036. ///
  2037. /// \headerfile <x86intrin.h>
  2038. ///
  2039. /// \code
  2040. /// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
  2041. /// \endcode
  2042. ///
  2043. /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  2044. /// instruction.
  2045. ///
  2046. /// \param A
  2047. ///    A 128-bit integer vector containing one of the source operands to be
  2048. ///    compared.
  2049. /// \param LA
  2050. ///    An integer that specifies the length of the string in \a A.
  2051. /// \param B
  2052. ///    A 128-bit integer vector containing one of the source operands to be
  2053. ///    compared.
  2054. /// \param LB
  2055. ///    An integer that specifies the length of the string in \a B.
  2056. /// \param M
  2057. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  2058. ///    words and the type of comparison to perform. \n
  2059. ///    Bits [1:0]: Determine source data format. \n
  2060. ///      00: 16 unsigned bytes \n
  2061. ///      01: 8 unsigned words \n
  2062. ///      10: 16 signed bytes \n
  2063. ///      11: 8 signed words \n
  2064. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  2065. ///      00: Subset: Each character in \a B is compared for equality with all
  2066. ///          the characters in \a A. \n
  2067. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  2068. ///          basis is greater than or equal for even-indexed elements in \a A,
  2069. ///          and less than or equal for odd-indexed elements in \a A. \n
  2070. ///      10: Match: Compare each pair of corresponding characters in \a A and
  2071. ///          \a B for equality. \n
  2072. ///      11: Substring: Search \a B for substring matches of \a A. \n
  2073. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  2074. ///                mask of the comparison results. \n
  2075. ///      00: No effect. \n
  2076. ///      01: Negate the bit mask. \n
  2077. ///      10: No effect. \n
  2078. ///      11: Negate the bit mask only for bits with an index less than or equal
  2079. ///          to the size of \a A or \a B.
  2080. /// \returns 1 if the bit mask is zero and the length of the string in \a B is
  2081. ///    the maximum; otherwise, 0.
  2082. #define _mm_cmpestra(A, LA, B, LB, M)                                          \
  2083.   ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA),         \
  2084.                                      (__v16qi)(__m128i)(B), (int)(LB),         \
  2085.                                      (int)(M)))
  2086.  
  2087. /// Uses the immediate operand \a M to perform a comparison of string
  2088. ///    data with explicitly defined lengths that is contained in source operands
  2089. ///    \a A and \a B. Returns 1 if the resulting bit mask is non-zero,
  2090. ///    otherwise, returns 0.
  2091. ///
  2092. /// \headerfile <x86intrin.h>
  2093. ///
  2094. /// \code
  2095. /// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
  2096. /// \endcode
  2097. ///
  2098. /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  2099. /// instruction.
  2100. ///
  2101. /// \param A
  2102. ///    A 128-bit integer vector containing one of the source operands to be
  2103. ///    compared.
  2104. /// \param LA
  2105. ///    An integer that specifies the length of the string in \a A.
  2106. /// \param B
  2107. ///    A 128-bit integer vector containing one of the source operands to be
  2108. ///    compared.
  2109. /// \param LB
  2110. ///    An integer that specifies the length of the string in \a B.
  2111. /// \param M
  2112. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  2113. ///    words and the type of comparison to perform. \n
  2114. ///    Bits [1:0]: Determine source data format. \n
  2115. ///      00: 16 unsigned bytes \n
  2116. ///      01: 8 unsigned words \n
  2117. ///      10: 16 signed bytes \n
  2118. ///      11: 8 signed words \n
  2119. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  2120. ///      00: Subset: Each character in \a B is compared for equality with all
  2121. ///          the characters in \a A. \n
  2122. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  2123. ///          basis is greater than or equal for even-indexed elements in \a A,
  2124. ///          and less than or equal for odd-indexed elements in \a A. \n
  2125. ///      10: Match: Compare each pair of corresponding characters in \a A and
  2126. ///          \a B for equality. \n
  2127. ///      11: Substring: Search \a B for substring matches of \a A. \n
  2128. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  2129. ///                mask of the comparison results. \n
  2130. ///      00: No effect. \n
  2131. ///      01: Negate the bit mask. \n
  2132. ///      10: No effect. \n
  2133. ///      11: Negate the bit mask only for bits with an index less than or equal
  2134. ///          to the size of \a A or \a B. \n
  2135. /// \returns 1 if the resulting bit mask is non-zero; otherwise, 0.
  2136. #define _mm_cmpestrc(A, LA, B, LB, M)                                          \
  2137.   ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA),         \
  2138.                                      (__v16qi)(__m128i)(B), (int)(LB),         \
  2139.                                      (int)(M)))
  2140.  
  2141. /// Uses the immediate operand \a M to perform a comparison of string
  2142. ///    data with explicitly defined lengths that is contained in source operands
  2143. ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
  2144. ///
  2145. /// \headerfile <x86intrin.h>
  2146. ///
  2147. /// \code
  2148. /// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
  2149. /// \endcode
  2150. ///
  2151. /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  2152. /// instruction.
  2153. ///
  2154. /// \param A
  2155. ///    A 128-bit integer vector containing one of the source operands to be
  2156. ///    compared.
  2157. /// \param LA
  2158. ///    An integer that specifies the length of the string in \a A.
  2159. /// \param B
  2160. ///    A 128-bit integer vector containing one of the source operands to be
  2161. ///    compared.
  2162. /// \param LB
  2163. ///    An integer that specifies the length of the string in \a B.
  2164. /// \param M
  2165. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  2166. ///    words and the type of comparison to perform. \n
  2167. ///    Bits [1:0]: Determine source data format. \n
  2168. ///      00: 16 unsigned bytes \n
  2169. ///      01: 8 unsigned words \n
  2170. ///      10: 16 signed bytes \n
  2171. ///      11: 8 signed words \n
  2172. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  2173. ///      00: Subset: Each character in \a B is compared for equality with all
  2174. ///          the characters in \a A. \n
  2175. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  2176. ///          basis is greater than or equal for even-indexed elements in \a A,
  2177. ///          and less than or equal for odd-indexed elements in \a A. \n
  2178. ///      10: Match: Compare each pair of corresponding characters in \a A and
  2179. ///          \a B for equality. \n
  2180. ///      11: Substring: Search \a B for substring matches of \a A. \n
  2181. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  2182. ///                mask of the comparison results. \n
  2183. ///      00: No effect. \n
  2184. ///      01: Negate the bit mask. \n
  2185. ///      10: No effect. \n
  2186. ///      11: Negate the bit mask only for bits with an index less than or equal
  2187. ///          to the size of \a A or \a B.
  2188. /// \returns Bit 0 of the resulting bit mask.
  2189. #define _mm_cmpestro(A, LA, B, LB, M)                                          \
  2190.   ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA),         \
  2191.                                      (__v16qi)(__m128i)(B), (int)(LB),         \
  2192.                                      (int)(M)))
  2193.  
  2194. /// Uses the immediate operand \a M to perform a comparison of string
  2195. ///    data with explicitly defined lengths that is contained in source operands
  2196. ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
  2197. ///    the maximum, otherwise, returns 0.
  2198. ///
  2199. /// \headerfile <x86intrin.h>
  2200. ///
  2201. /// \code
  2202. /// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
  2203. /// \endcode
  2204. ///
  2205. /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  2206. /// instruction.
  2207. ///
  2208. /// \param A
  2209. ///    A 128-bit integer vector containing one of the source operands to be
  2210. ///    compared.
  2211. /// \param LA
  2212. ///    An integer that specifies the length of the string in \a A.
  2213. /// \param B
  2214. ///    A 128-bit integer vector containing one of the source operands to be
  2215. ///    compared.
  2216. /// \param LB
  2217. ///    An integer that specifies the length of the string in \a B.
  2218. /// \param M
  2219. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  2220. ///    words and the type of comparison to perform. \n
  2221. ///    Bits [1:0]: Determine source data format. \n
  2222. ///      00: 16 unsigned bytes \n
  2223. ///      01: 8 unsigned words \n
  2224. ///      10: 16 signed bytes \n
  2225. ///      11: 8 signed words \n
  2226. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  2227. ///      00: Subset: Each character in \a B is compared for equality with all
  2228. ///          the characters in \a A. \n
  2229. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  2230. ///          basis is greater than or equal for even-indexed elements in \a A,
  2231. ///          and less than or equal for odd-indexed elements in \a A. \n
  2232. ///      10: Match: Compare each pair of corresponding characters in \a A and
  2233. ///          \a B for equality. \n
  2234. ///      11: Substring: Search \a B for substring matches of \a A. \n
  2235. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  2236. ///                mask of the comparison results. \n
  2237. ///      00: No effect. \n
  2238. ///      01: Negate the bit mask. \n
  2239. ///      10: No effect. \n
  2240. ///      11: Negate the bit mask only for bits with an index less than or equal
  2241. ///          to the size of \a A or \a B. \n
  2242. /// \returns 1 if the length of the string in \a A is less than the maximum;
  2243. ///    otherwise, 0.
  2244. #define _mm_cmpestrs(A, LA, B, LB, M)                                          \
  2245.   ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA),         \
  2246.                                      (__v16qi)(__m128i)(B), (int)(LB),         \
  2247.                                      (int)(M)))
  2248.  
  2249. /// Uses the immediate operand \a M to perform a comparison of string
  2250. ///    data with explicitly defined lengths that is contained in source operands
  2251. ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
  2252. ///    the maximum, otherwise, returns 0.
  2253. ///
  2254. /// \headerfile <x86intrin.h>
  2255. ///
  2256. /// \code
  2257. /// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
  2258. /// \endcode
  2259. ///
  2260. /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction.
  2261. ///
  2262. /// \param A
  2263. ///    A 128-bit integer vector containing one of the source operands to be
  2264. ///    compared.
  2265. /// \param LA
  2266. ///    An integer that specifies the length of the string in \a A.
  2267. /// \param B
  2268. ///    A 128-bit integer vector containing one of the source operands to be
  2269. ///    compared.
  2270. /// \param LB
  2271. ///    An integer that specifies the length of the string in \a B.
  2272. /// \param M
  2273. ///    An 8-bit immediate operand specifying whether the characters are bytes or
  2274. ///    words and the type of comparison to perform. \n
  2275. ///    Bits [1:0]: Determine source data format. \n
  2276. ///      00: 16 unsigned bytes \n
  2277. ///      01: 8 unsigned words \n
  2278. ///      10: 16 signed bytes \n
  2279. ///      11: 8 signed words \n
  2280. ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  2281. ///      00: Subset: Each character in \a B is compared for equality with all
  2282. ///          the characters in \a A. \n
  2283. ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  2284. ///          basis is greater than or equal for even-indexed elements in \a A,
  2285. ///          and less than or equal for odd-indexed elements in \a A. \n
  2286. ///      10: Match: Compare each pair of corresponding characters in \a A and
  2287. ///          \a B for equality. \n
  2288. ///      11: Substring: Search \a B for substring matches of \a A. \n
  2289. ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  2290. ///                mask of the comparison results. \n
  2291. ///      00: No effect. \n
  2292. ///      01: Negate the bit mask. \n
  2293. ///      10: No effect. \n
  2294. ///      11: Negate the bit mask only for bits with an index less than or equal
  2295. ///          to the size of \a A or \a B.
  2296. /// \returns 1 if the length of the string in \a B is less than the maximum;
  2297. ///    otherwise, 0.
  2298. #define _mm_cmpestrz(A, LA, B, LB, M)                                          \
  2299.   ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA),         \
  2300.                                      (__v16qi)(__m128i)(B), (int)(LB),         \
  2301.                                      (int)(M)))
  2302.  
  2303. /* SSE4.2 Compare Packed Data -- Greater Than.  */
  2304. /// Compares each of the corresponding 64-bit values of the 128-bit
  2305. ///    integer vectors to determine if the values in the first operand are
  2306. ///    greater than those in the second operand.
  2307. ///
  2308. /// \headerfile <x86intrin.h>
  2309. ///
  2310. /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
  2311. ///
  2312. /// \param __V1
  2313. ///    A 128-bit integer vector.
  2314. /// \param __V2
  2315. ///    A 128-bit integer vector.
  2316. /// \returns A 128-bit vector of two 64-bit masks: all ones where greater, else 0.
  2317. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
  2318.                                                              __m128i __V2) {
  2319.   return (__m128i)((__v2di)__V1 > (__v2di)__V2);
  2320. }
  2321.  
  2322. #undef __DEFAULT_FN_ATTRS
  2323.  
  2324. #include <popcntintrin.h>
  2325.  
  2326. #include <crc32intrin.h>
  2327.  
  2328. #endif /* __SMMINTRIN_H */
  2329.