Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __XMMINTRIN_H
  11. #define __XMMINTRIN_H
  12.  
  13. #if !defined(__i386__) && !defined(__x86_64__)
  14. #error "This header is only meant to be used on x86 and x64 architecture"
  15. #endif
  16.  
  17. #include <mmintrin.h>
  18.  
  19. typedef int __v4si __attribute__((__vector_size__(16)));
  20. typedef float __v4sf __attribute__((__vector_size__(16)));
  21. typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
  22.  
  23. typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
  24.  
  25. /* Unsigned types */
  26. typedef unsigned int __v4su __attribute__((__vector_size__(16)));
  27.  
  28. /* This header should only be included in a hosted environment as it depends on
  29.  * a standard library to provide allocation routines. */
  30. #if __STDC_HOSTED__
  31. #include <mm_malloc.h>
  32. #endif
  33.  
  34. /* Define the default attributes for the functions in this file. */
  35. #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
  36. #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
  37.  
  38. /// Adds the 32-bit float values in the low-order bits of the operands.
  39. ///
  40. /// \headerfile <x86intrin.h>
  41. ///
  42. /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
  43. ///
  44. /// \param __a
  45. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  46. ///    The lower 32 bits of this operand are used in the calculation.
  47. /// \param __b
  48. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  49. ///    The lower 32 bits of this operand are used in the calculation.
  50. /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
  51. ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
  52. ///    the upper 96 bits of the first source operand.
  53. static __inline__ __m128 __DEFAULT_FN_ATTRS
  54. _mm_add_ss(__m128 __a, __m128 __b)
  55. {
  56.   __a[0] += __b[0];
  57.   return __a;
  58. }
  59.  
  60. /// Adds two 128-bit vectors of [4 x float], and returns the results of
  61. ///    the addition.
  62. ///
  63. /// \headerfile <x86intrin.h>
  64. ///
  65. /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
  66. ///
  67. /// \param __a
  68. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  69. /// \param __b
  70. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  71. /// \returns A 128-bit vector of [4 x float] containing the sums of both
  72. ///    operands.
  73. static __inline__ __m128 __DEFAULT_FN_ATTRS
  74. _mm_add_ps(__m128 __a, __m128 __b)
  75. {
  76.   return (__m128)((__v4sf)__a + (__v4sf)__b);
  77. }
  78.  
  79. /// Subtracts the 32-bit float value in the low-order bits of the second
  80. ///    operand from the corresponding value in the first operand.
  81. ///
  82. /// \headerfile <x86intrin.h>
  83. ///
  84. /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
  85. ///
  86. /// \param __a
  87. ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
  88. ///    of this operand are used in the calculation.
  89. /// \param __b
  90. ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
  91. ///    bits of this operand are used in the calculation.
  92. /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
  93. ///    difference of the lower 32 bits of both operands. The upper 96 bits are
  94. ///    copied from the upper 96 bits of the first source operand.
  95. static __inline__ __m128 __DEFAULT_FN_ATTRS
  96. _mm_sub_ss(__m128 __a, __m128 __b)
  97. {
  98.   __a[0] -= __b[0];
  99.   return __a;
  100. }
  101.  
  102. /// Subtracts each of the values of the second operand from the first
  103. ///    operand, both of which are 128-bit vectors of [4 x float] and returns
  104. ///    the results of the subtraction.
  105. ///
  106. /// \headerfile <x86intrin.h>
  107. ///
  108. /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
  109. ///
  110. /// \param __a
  111. ///    A 128-bit vector of [4 x float] containing the minuend.
  112. /// \param __b
  113. ///    A 128-bit vector of [4 x float] containing the subtrahend.
  114. /// \returns A 128-bit vector of [4 x float] containing the differences between
  115. ///    both operands.
  116. static __inline__ __m128 __DEFAULT_FN_ATTRS
  117. _mm_sub_ps(__m128 __a, __m128 __b)
  118. {
  119.   return (__m128)((__v4sf)__a - (__v4sf)__b);
  120. }
  121.  
  122. /// Multiplies two 32-bit float values in the low-order bits of the
  123. ///    operands.
  124. ///
  125. /// \headerfile <x86intrin.h>
  126. ///
  127. /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
  128. ///
  129. /// \param __a
  130. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  131. ///    The lower 32 bits of this operand are used in the calculation.
  132. /// \param __b
  133. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  134. ///    The lower 32 bits of this operand are used in the calculation.
  135. /// \returns A 128-bit vector of [4 x float] containing the product of the lower
  136. ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
  137. ///    bits of the first source operand.
  138. static __inline__ __m128 __DEFAULT_FN_ATTRS
  139. _mm_mul_ss(__m128 __a, __m128 __b)
  140. {
  141.   __a[0] *= __b[0];
  142.   return __a;
  143. }
  144.  
  145. /// Multiplies two 128-bit vectors of [4 x float] and returns the
  146. ///    results of the multiplication.
  147. ///
  148. /// \headerfile <x86intrin.h>
  149. ///
  150. /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
  151. ///
  152. /// \param __a
  153. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  154. /// \param __b
  155. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  156. /// \returns A 128-bit vector of [4 x float] containing the products of both
  157. ///    operands.
  158. static __inline__ __m128 __DEFAULT_FN_ATTRS
  159. _mm_mul_ps(__m128 __a, __m128 __b)
  160. {
  161.   return (__m128)((__v4sf)__a * (__v4sf)__b);
  162. }
  163.  
  164. /// Divides the value in the low-order 32 bits of the first operand by
  165. ///    the corresponding value in the second operand.
  166. ///
  167. /// \headerfile <x86intrin.h>
  168. ///
  169. /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
  170. ///
  171. /// \param __a
  172. ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
  173. ///    bits of this operand are used in the calculation.
  174. /// \param __b
  175. ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
  176. ///    of this operand are used in the calculation.
  177. /// \returns A 128-bit vector of [4 x float] containing the quotients of the
  178. ///    lower 32 bits of both operands. The upper 96 bits are copied from the
  179. ///    upper 96 bits of the first source operand.
  180. static __inline__ __m128 __DEFAULT_FN_ATTRS
  181. _mm_div_ss(__m128 __a, __m128 __b)
  182. {
  183.   __a[0] /= __b[0];
  184.   return __a;
  185. }
  186.  
  187. /// Divides two 128-bit vectors of [4 x float].
  188. ///
  189. /// \headerfile <x86intrin.h>
  190. ///
  191. /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
  192. ///
  193. /// \param __a
  194. ///    A 128-bit vector of [4 x float] containing the dividend.
  195. /// \param __b
  196. ///    A 128-bit vector of [4 x float] containing the divisor.
  197. /// \returns A 128-bit vector of [4 x float] containing the quotients of both
  198. ///    operands.
  199. static __inline__ __m128 __DEFAULT_FN_ATTRS
  200. _mm_div_ps(__m128 __a, __m128 __b)
  201. {
  202.   return (__m128)((__v4sf)__a / (__v4sf)__b);
  203. }
  204.  
  205. /// Calculates the square root of the value stored in the low-order bits
  206. ///    of a 128-bit vector of [4 x float].
  207. ///
  208. /// \headerfile <x86intrin.h>
  209. ///
  210. /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
  211. ///
  212. /// \param __a
  213. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  214. ///    used in the calculation.
  215. /// \returns A 128-bit vector of [4 x float] containing the square root of the
  216. ///    value in the low-order bits of the operand.
  217. static __inline__ __m128 __DEFAULT_FN_ATTRS
  218. _mm_sqrt_ss(__m128 __a)
  219. {
  220.   return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
  221. }
  222.  
  223. /// Calculates the square roots of the values stored in a 128-bit vector
  224. ///    of [4 x float].
  225. ///
  226. /// \headerfile <x86intrin.h>
  227. ///
  228. /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
  229. ///
  230. /// \param __a
  231. ///    A 128-bit vector of [4 x float].
  232. /// \returns A 128-bit vector of [4 x float] containing the square roots of the
  233. ///    values in the operand.
  234. static __inline__ __m128 __DEFAULT_FN_ATTRS
  235. _mm_sqrt_ps(__m128 __a)
  236. {
  237.   return __builtin_ia32_sqrtps((__v4sf)__a);
  238. }
  239.  
  240. /// Calculates the approximate reciprocal of the value stored in the
  241. ///    low-order bits of a 128-bit vector of [4 x float].
  242. ///
  243. /// \headerfile <x86intrin.h>
  244. ///
  245. /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
  246. ///
  247. /// \param __a
  248. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  249. ///    used in the calculation.
  250. /// \returns A 128-bit vector of [4 x float] containing the approximate
  251. ///    reciprocal of the value in the low-order bits of the operand.
  252. static __inline__ __m128 __DEFAULT_FN_ATTRS
  253. _mm_rcp_ss(__m128 __a)
  254. {
  255.   return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
  256. }
  257.  
  258. /// Calculates the approximate reciprocals of the values stored in a
  259. ///    128-bit vector of [4 x float].
  260. ///
  261. /// \headerfile <x86intrin.h>
  262. ///
  263. /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
  264. ///
  265. /// \param __a
  266. ///    A 128-bit vector of [4 x float].
  267. /// \returns A 128-bit vector of [4 x float] containing the approximate
  268. ///    reciprocals of the values in the operand.
  269. static __inline__ __m128 __DEFAULT_FN_ATTRS
  270. _mm_rcp_ps(__m128 __a)
  271. {
  272.   return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
  273. }
  274.  
  275. /// Calculates the approximate reciprocal of the square root of the value
  276. ///    stored in the low-order bits of a 128-bit vector of [4 x float].
  277. ///
  278. /// \headerfile <x86intrin.h>
  279. ///
  280. /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
  281. ///
  282. /// \param __a
  283. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  284. ///    used in the calculation.
  285. /// \returns A 128-bit vector of [4 x float] containing the approximate
  286. ///    reciprocal of the square root of the value in the low-order bits of the
  287. ///    operand.
  288. static __inline__ __m128 __DEFAULT_FN_ATTRS
  289. _mm_rsqrt_ss(__m128 __a)
  290. {
  291.   return __builtin_ia32_rsqrtss((__v4sf)__a);
  292. }
  293.  
  294. /// Calculates the approximate reciprocals of the square roots of the
  295. ///    values stored in a 128-bit vector of [4 x float].
  296. ///
  297. /// \headerfile <x86intrin.h>
  298. ///
  299. /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
  300. ///
  301. /// \param __a
  302. ///    A 128-bit vector of [4 x float].
  303. /// \returns A 128-bit vector of [4 x float] containing the approximate
  304. ///    reciprocals of the square roots of the values in the operand.
  305. static __inline__ __m128 __DEFAULT_FN_ATTRS
  306. _mm_rsqrt_ps(__m128 __a)
  307. {
  308.   return __builtin_ia32_rsqrtps((__v4sf)__a);
  309. }
  310.  
  311. /// Compares two 32-bit float values in the low-order bits of both
  312. ///    operands and returns the lesser value in the low-order bits of the
  313. ///    vector of [4 x float].
  314. ///
  315. /// \headerfile <x86intrin.h>
  316. ///
  317. /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
  318. ///
  319. /// \param __a
  320. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  321. ///    32 bits of this operand are used in the comparison.
  322. /// \param __b
  323. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  324. ///    32 bits of this operand are used in the comparison.
  325. /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
  326. ///    minimum value between both operands. The upper 96 bits are copied from
  327. ///    the upper 96 bits of the first source operand.
  328. static __inline__ __m128 __DEFAULT_FN_ATTRS
  329. _mm_min_ss(__m128 __a, __m128 __b)
  330. {
  331.   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
  332. }
  333.  
  334. /// Compares two 128-bit vectors of [4 x float] and returns the lesser
  335. ///    of each pair of values.
  336. ///
  337. /// \headerfile <x86intrin.h>
  338. ///
  339. /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
  340. ///
  341. /// \param __a
  342. ///    A 128-bit vector of [4 x float] containing one of the operands.
  343. /// \param __b
  344. ///    A 128-bit vector of [4 x float] containing one of the operands.
  345. /// \returns A 128-bit vector of [4 x float] containing the minimum values
  346. ///    between both operands.
  347. static __inline__ __m128 __DEFAULT_FN_ATTRS
  348. _mm_min_ps(__m128 __a, __m128 __b)
  349. {
  350.   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
  351. }
  352.  
  353. /// Compares two 32-bit float values in the low-order bits of both
  354. ///    operands and returns the greater value in the low-order bits of a 128-bit
  355. ///    vector of [4 x float].
  356. ///
  357. /// \headerfile <x86intrin.h>
  358. ///
  359. /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
  360. ///
  361. /// \param __a
  362. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  363. ///    32 bits of this operand are used in the comparison.
  364. /// \param __b
  365. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  366. ///    32 bits of this operand are used in the comparison.
  367. /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
  368. ///    maximum value between both operands. The upper 96 bits are copied from
  369. ///    the upper 96 bits of the first source operand.
  370. static __inline__ __m128 __DEFAULT_FN_ATTRS
  371. _mm_max_ss(__m128 __a, __m128 __b)
  372. {
  373.   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
  374. }
  375.  
  376. /// Compares two 128-bit vectors of [4 x float] and returns the greater
  377. ///    of each pair of values.
  378. ///
  379. /// \headerfile <x86intrin.h>
  380. ///
  381. /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
  382. ///
  383. /// \param __a
  384. ///    A 128-bit vector of [4 x float] containing one of the operands.
  385. /// \param __b
  386. ///    A 128-bit vector of [4 x float] containing one of the operands.
  387. /// \returns A 128-bit vector of [4 x float] containing the maximum values
  388. ///    between both operands.
  389. static __inline__ __m128 __DEFAULT_FN_ATTRS
  390. _mm_max_ps(__m128 __a, __m128 __b)
  391. {
  392.   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
  393. }
  394.  
  395. /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
  396. ///
  397. /// \headerfile <x86intrin.h>
  398. ///
  399. /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
  400. ///
  401. /// \param __a
  402. ///    A 128-bit vector containing one of the source operands.
  403. /// \param __b
  404. ///    A 128-bit vector containing one of the source operands.
  405. /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
  406. ///    values between both operands.
  407. static __inline__ __m128 __DEFAULT_FN_ATTRS
  408. _mm_and_ps(__m128 __a, __m128 __b)
  409. {
  410.   return (__m128)((__v4su)__a & (__v4su)__b);
  411. }
  412.  
  413. /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
  414. ///    the one's complement of the values contained in the first source
  415. ///    operand.
  416. ///
  417. /// \headerfile <x86intrin.h>
  418. ///
  419. /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
  420. ///
  421. /// \param __a
  422. ///    A 128-bit vector of [4 x float] containing the first source operand. The
  423. ///    one's complement of this value is used in the bitwise AND.
  424. /// \param __b
  425. ///    A 128-bit vector of [4 x float] containing the second source operand.
  426. /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
  427. ///    one's complement of the first operand and the values in the second
  428. ///    operand.
  429. static __inline__ __m128 __DEFAULT_FN_ATTRS
  430. _mm_andnot_ps(__m128 __a, __m128 __b)
  431. {
  432.   return (__m128)(~(__v4su)__a & (__v4su)__b);
  433. }
  434.  
  435. /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
  436. ///
  437. /// \headerfile <x86intrin.h>
  438. ///
  439. /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
  440. ///
  441. /// \param __a
  442. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  443. /// \param __b
  444. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  445. /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
  446. ///    values between both operands.
  447. static __inline__ __m128 __DEFAULT_FN_ATTRS
  448. _mm_or_ps(__m128 __a, __m128 __b)
  449. {
  450.   return (__m128)((__v4su)__a | (__v4su)__b);
  451. }
  452.  
  453. /// Performs a bitwise exclusive OR of two 128-bit vectors of
  454. ///    [4 x float].
  455. ///
  456. /// \headerfile <x86intrin.h>
  457. ///
  458. /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
  459. ///
  460. /// \param __a
  461. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  462. /// \param __b
  463. ///    A 128-bit vector of [4 x float] containing one of the source operands.
  464. /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
  465. ///    of the values between both operands.
  466. static __inline__ __m128 __DEFAULT_FN_ATTRS
  467. _mm_xor_ps(__m128 __a, __m128 __b)
  468. {
  469.   return (__m128)((__v4su)__a ^ (__v4su)__b);
  470. }
  471.  
  472. /// Compares two 32-bit float values in the low-order bits of both
  473. ///    operands for equality and returns the result of the comparison in the
  474. ///    low-order bits of a vector [4 x float].
  475. ///
  476. /// \headerfile <x86intrin.h>
  477. ///
  478. /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
  479. ///
  480. /// \param __a
  481. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  482. ///    32 bits of this operand are used in the comparison.
  483. /// \param __b
  484. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  485. ///    32 bits of this operand are used in the comparison.
  486. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  487. ///    in the low-order bits.
  488. static __inline__ __m128 __DEFAULT_FN_ATTRS
  489. _mm_cmpeq_ss(__m128 __a, __m128 __b)
  490. {
  491.   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
  492. }
  493.  
  494. /// Compares each of the corresponding 32-bit float values of the
  495. ///    128-bit vectors of [4 x float] for equality.
  496. ///
  497. /// \headerfile <x86intrin.h>
  498. ///
  499. /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
  500. ///
  501. /// \param __a
  502. ///    A 128-bit vector of [4 x float].
  503. /// \param __b
  504. ///    A 128-bit vector of [4 x float].
  505. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  506. static __inline__ __m128 __DEFAULT_FN_ATTRS
  507. _mm_cmpeq_ps(__m128 __a, __m128 __b)
  508. {
  509.   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
  510. }
  511.  
  512. /// Compares two 32-bit float values in the low-order bits of both
  513. ///    operands to determine if the value in the first operand is less than the
  514. ///    corresponding value in the second operand and returns the result of the
  515. ///    comparison in the low-order bits of a vector of [4 x float].
  516. ///
  517. /// \headerfile <x86intrin.h>
  518. ///
  519. /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
  520. ///
  521. /// \param __a
  522. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  523. ///    32 bits of this operand are used in the comparison.
  524. /// \param __b
  525. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  526. ///    32 bits of this operand are used in the comparison.
  527. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  528. ///    in the low-order bits.
  529. static __inline__ __m128 __DEFAULT_FN_ATTRS
  530. _mm_cmplt_ss(__m128 __a, __m128 __b)
  531. {
  532.   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
  533. }
  534.  
  535. /// Compares each of the corresponding 32-bit float values of the
  536. ///    128-bit vectors of [4 x float] to determine if the values in the first
  537. ///    operand are less than those in the second operand.
  538. ///
  539. /// \headerfile <x86intrin.h>
  540. ///
  541. /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
  542. ///
  543. /// \param __a
  544. ///    A 128-bit vector of [4 x float].
  545. /// \param __b
  546. ///    A 128-bit vector of [4 x float].
  547. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  548. static __inline__ __m128 __DEFAULT_FN_ATTRS
  549. _mm_cmplt_ps(__m128 __a, __m128 __b)
  550. {
  551.   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
  552. }
  553.  
  554. /// Compares two 32-bit float values in the low-order bits of both
  555. ///    operands to determine if the value in the first operand is less than or
  556. ///    equal to the corresponding value in the second operand and returns the
  557. ///    result of the comparison in the low-order bits of a vector of
  558. ///    [4 x float].
  559. ///
  560. /// \headerfile <x86intrin.h>
  561. ///
  562. /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
  563. ///
  564. /// \param __a
  565. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  566. ///    32 bits of this operand are used in the comparison.
  567. /// \param __b
  568. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  569. ///    32 bits of this operand are used in the comparison.
  570. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  571. ///    in the low-order bits.
  572. static __inline__ __m128 __DEFAULT_FN_ATTRS
  573. _mm_cmple_ss(__m128 __a, __m128 __b)
  574. {
  575.   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
  576. }
  577.  
  578. /// Compares each of the corresponding 32-bit float values of the
  579. ///    128-bit vectors of [4 x float] to determine if the values in the first
  580. ///    operand are less than or equal to those in the second operand.
  581. ///
  582. /// \headerfile <x86intrin.h>
  583. ///
  584. /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
  585. ///
  586. /// \param __a
  587. ///    A 128-bit vector of [4 x float].
  588. /// \param __b
  589. ///    A 128-bit vector of [4 x float].
  590. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  591. static __inline__ __m128 __DEFAULT_FN_ATTRS
  592. _mm_cmple_ps(__m128 __a, __m128 __b)
  593. {
  594.   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
  595. }
  596.  
  597. /// Compares two 32-bit float values in the low-order bits of both
  598. ///    operands to determine if the value in the first operand is greater than
  599. ///    the corresponding value in the second operand and returns the result of
  600. ///    the comparison in the low-order bits of a vector of [4 x float].
  601. ///
  602. /// \headerfile <x86intrin.h>
  603. ///
  604. /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
  605. ///
  606. /// \param __a
  607. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  608. ///    32 bits of this operand are used in the comparison.
  609. /// \param __b
  610. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  611. ///    32 bits of this operand are used in the comparison.
  612. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  613. ///    in the low-order bits.
  614. static __inline__ __m128 __DEFAULT_FN_ATTRS
  615. _mm_cmpgt_ss(__m128 __a, __m128 __b)
  616. {
  617.   return (__m128)__builtin_shufflevector((__v4sf)__a,
  618.                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
  619.                                          4, 1, 2, 3);
  620. }
  621.  
  622. /// Compares each of the corresponding 32-bit float values of the
  623. ///    128-bit vectors of [4 x float] to determine if the values in the first
  624. ///    operand are greater than those in the second operand.
  625. ///
  626. /// \headerfile <x86intrin.h>
  627. ///
  628. /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
  629. ///
  630. /// \param __a
  631. ///    A 128-bit vector of [4 x float].
  632. /// \param __b
  633. ///    A 128-bit vector of [4 x float].
  634. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  635. static __inline__ __m128 __DEFAULT_FN_ATTRS
  636. _mm_cmpgt_ps(__m128 __a, __m128 __b)
  637. {
  638.   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
  639. }
  640.  
  641. /// Compares two 32-bit float values in the low-order bits of both
  642. ///    operands to determine if the value in the first operand is greater than
  643. ///    or equal to the corresponding value in the second operand and returns
  644. ///    the result of the comparison in the low-order bits of a vector of
  645. ///    [4 x float].
  646. ///
  647. /// \headerfile <x86intrin.h>
  648. ///
  649. /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
  650. ///
  651. /// \param __a
  652. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  653. ///    32 bits of this operand are used in the comparison.
  654. /// \param __b
  655. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  656. ///    32 bits of this operand are used in the comparison.
  657. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  658. ///    in the low-order bits.
  659. static __inline__ __m128 __DEFAULT_FN_ATTRS
  660. _mm_cmpge_ss(__m128 __a, __m128 __b)
  661. {
  662.   return (__m128)__builtin_shufflevector((__v4sf)__a,
  663.                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
  664.                                          4, 1, 2, 3);
  665. }
  666.  
  667. /// Compares each of the corresponding 32-bit float values of the
  668. ///    128-bit vectors of [4 x float] to determine if the values in the first
  669. ///    operand are greater than or equal to those in the second operand.
  670. ///
  671. /// \headerfile <x86intrin.h>
  672. ///
  673. /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
  674. ///
  675. /// \param __a
  676. ///    A 128-bit vector of [4 x float].
  677. /// \param __b
  678. ///    A 128-bit vector of [4 x float].
  679. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  680. static __inline__ __m128 __DEFAULT_FN_ATTRS
  681. _mm_cmpge_ps(__m128 __a, __m128 __b)
  682. {
  683.   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
  684. }
  685.  
  686. /// Compares two 32-bit float values in the low-order bits of both
  687. ///    operands for inequality and returns the result of the comparison in the
  688. ///    low-order bits of a vector of [4 x float].
  689. ///
  690. /// \headerfile <x86intrin.h>
  691. ///
  692. /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
  693. ///   instructions.
  694. ///
  695. /// \param __a
  696. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  697. ///    32 bits of this operand are used in the comparison.
  698. /// \param __b
  699. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  700. ///    32 bits of this operand are used in the comparison.
  701. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  702. ///    in the low-order bits.
  703. static __inline__ __m128 __DEFAULT_FN_ATTRS
  704. _mm_cmpneq_ss(__m128 __a, __m128 __b)
  705. {
  706.   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
  707. }
  708.  
  709. /// Compares each of the corresponding 32-bit float values of the
  710. ///    128-bit vectors of [4 x float] for inequality.
  711. ///
  712. /// \headerfile <x86intrin.h>
  713. ///
  714. /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
  715. ///   instructions.
  716. ///
  717. /// \param __a
  718. ///    A 128-bit vector of [4 x float].
  719. /// \param __b
  720. ///    A 128-bit vector of [4 x float].
  721. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  722. static __inline__ __m128 __DEFAULT_FN_ATTRS
  723. _mm_cmpneq_ps(__m128 __a, __m128 __b)
  724. {
  725.   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
  726. }
  727.  
  728. /// Compares two 32-bit float values in the low-order bits of both
  729. ///    operands to determine if the value in the first operand is not less than
  730. ///    the corresponding value in the second operand and returns the result of
  731. ///    the comparison in the low-order bits of a vector of [4 x float].
  732. ///
  733. /// \headerfile <x86intrin.h>
  734. ///
  735. /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
  736. ///   instructions.
  737. ///
  738. /// \param __a
  739. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  740. ///    32 bits of this operand are used in the comparison.
  741. /// \param __b
  742. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  743. ///    32 bits of this operand are used in the comparison.
  744. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  745. ///    in the low-order bits.
  746. static __inline__ __m128 __DEFAULT_FN_ATTRS
  747. _mm_cmpnlt_ss(__m128 __a, __m128 __b)
  748. {
  749.   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
  750. }
  751.  
  752. /// Compares each of the corresponding 32-bit float values of the
  753. ///    128-bit vectors of [4 x float] to determine if the values in the first
  754. ///    operand are not less than those in the second operand.
  755. ///
  756. /// \headerfile <x86intrin.h>
  757. ///
  758. /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
  759. ///   instructions.
  760. ///
  761. /// \param __a
  762. ///    A 128-bit vector of [4 x float].
  763. /// \param __b
  764. ///    A 128-bit vector of [4 x float].
  765. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  766. static __inline__ __m128 __DEFAULT_FN_ATTRS
  767. _mm_cmpnlt_ps(__m128 __a, __m128 __b)
  768. {
  769.   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
  770. }
  771.  
  772. /// Compares two 32-bit float values in the low-order bits of both
  773. ///    operands to determine if the value in the first operand is not less than
  774. ///    or equal to the corresponding value in the second operand and returns
  775. ///    the result of the comparison in the low-order bits of a vector of
  776. ///    [4 x float].
  777. ///
  778. /// \headerfile <x86intrin.h>
  779. ///
  780. /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
  781. ///   instructions.
  782. ///
  783. /// \param __a
  784. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  785. ///    32 bits of this operand are used in the comparison.
  786. /// \param __b
  787. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  788. ///    32 bits of this operand are used in the comparison.
  789. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  790. ///    in the low-order bits.
  791. static __inline__ __m128 __DEFAULT_FN_ATTRS
  792. _mm_cmpnle_ss(__m128 __a, __m128 __b)
  793. {
  794.   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
  795. }
  796.  
  797. /// Compares each of the corresponding 32-bit float values of the
  798. ///    128-bit vectors of [4 x float] to determine if the values in the first
  799. ///    operand are not less than or equal to those in the second operand.
  800. ///
  801. /// \headerfile <x86intrin.h>
  802. ///
  803. /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
  804. ///   instructions.
  805. ///
  806. /// \param __a
  807. ///    A 128-bit vector of [4 x float].
  808. /// \param __b
  809. ///    A 128-bit vector of [4 x float].
  810. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  811. static __inline__ __m128 __DEFAULT_FN_ATTRS
  812. _mm_cmpnle_ps(__m128 __a, __m128 __b)
  813. {
  814.   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
  815. }
  816.  
  817. /// Compares two 32-bit float values in the low-order bits of both
  818. ///    operands to determine if the value in the first operand is not greater
  819. ///    than the corresponding value in the second operand and returns the
  820. ///    result of the comparison in the low-order bits of a vector of
  821. ///    [4 x float].
  822. ///
  823. /// \headerfile <x86intrin.h>
  824. ///
  825. /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
  826. ///   instructions.
  827. ///
  828. /// \param __a
  829. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  830. ///    32 bits of this operand are used in the comparison.
  831. /// \param __b
  832. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  833. ///    32 bits of this operand are used in the comparison.
  834. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  835. ///    in the low-order bits.
  836. static __inline__ __m128 __DEFAULT_FN_ATTRS
  837. _mm_cmpngt_ss(__m128 __a, __m128 __b)
  838. {
  839.   return (__m128)__builtin_shufflevector((__v4sf)__a,
  840.                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
  841.                                          4, 1, 2, 3);
  842. }
  843.  
  844. /// Compares each of the corresponding 32-bit float values of the
  845. ///    128-bit vectors of [4 x float] to determine if the values in the first
  846. ///    operand are not greater than those in the second operand.
  847. ///
  848. /// \headerfile <x86intrin.h>
  849. ///
  850. /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
  851. ///   instructions.
  852. ///
  853. /// \param __a
  854. ///    A 128-bit vector of [4 x float].
  855. /// \param __b
  856. ///    A 128-bit vector of [4 x float].
  857. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  858. static __inline__ __m128 __DEFAULT_FN_ATTRS
  859. _mm_cmpngt_ps(__m128 __a, __m128 __b)
  860. {
  861.   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
  862. }
  863.  
  864. /// Compares two 32-bit float values in the low-order bits of both
  865. ///    operands to determine if the value in the first operand is not greater
  866. ///    than or equal to the corresponding value in the second operand and
  867. ///    returns the result of the comparison in the low-order bits of a vector
  868. ///    of [4 x float].
  869. ///
  870. /// \headerfile <x86intrin.h>
  871. ///
  872. /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
  873. ///   instructions.
  874. ///
  875. /// \param __a
  876. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  877. ///    32 bits of this operand are used in the comparison.
  878. /// \param __b
  879. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  880. ///    32 bits of this operand are used in the comparison.
  881. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  882. ///    in the low-order bits.
  883. static __inline__ __m128 __DEFAULT_FN_ATTRS
  884. _mm_cmpnge_ss(__m128 __a, __m128 __b)
  885. {
  886.   return (__m128)__builtin_shufflevector((__v4sf)__a,
  887.                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
  888.                                          4, 1, 2, 3);
  889. }
  890.  
  891. /// Compares each of the corresponding 32-bit float values of the
  892. ///    128-bit vectors of [4 x float] to determine if the values in the first
  893. ///    operand are not greater than or equal to those in the second operand.
  894. ///
  895. /// \headerfile <x86intrin.h>
  896. ///
  897. /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
  898. ///   instructions.
  899. ///
  900. /// \param __a
  901. ///    A 128-bit vector of [4 x float].
  902. /// \param __b
  903. ///    A 128-bit vector of [4 x float].
  904. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  905. static __inline__ __m128 __DEFAULT_FN_ATTRS
  906. _mm_cmpnge_ps(__m128 __a, __m128 __b)
  907. {
  908.   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
  909. }
  910.  
  911. /// Compares two 32-bit float values in the low-order bits of both
  912. ///    operands to determine if the value in the first operand is ordered with
  913. ///    respect to the corresponding value in the second operand and returns the
  914. ///    result of the comparison in the low-order bits of a vector of
  915. ///    [4 x float].
  916. ///
  917. /// \headerfile <x86intrin.h>
  918. ///
  919. /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
  920. ///   instructions.
  921. ///
  922. /// \param __a
  923. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  924. ///    32 bits of this operand are used in the comparison.
  925. /// \param __b
  926. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  927. ///    32 bits of this operand are used in the comparison.
  928. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  929. ///    in the low-order bits.
  930. static __inline__ __m128 __DEFAULT_FN_ATTRS
  931. _mm_cmpord_ss(__m128 __a, __m128 __b)
  932. {
  933.   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
  934. }
  935.  
  936. /// Compares each of the corresponding 32-bit float values of the
  937. ///    128-bit vectors of [4 x float] to determine if the values in the first
  938. ///    operand are ordered with respect to those in the second operand.
  939. ///
  940. /// \headerfile <x86intrin.h>
  941. ///
  942. /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
  943. ///   instructions.
  944. ///
  945. /// \param __a
  946. ///    A 128-bit vector of [4 x float].
  947. /// \param __b
  948. ///    A 128-bit vector of [4 x float].
  949. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  950. static __inline__ __m128 __DEFAULT_FN_ATTRS
  951. _mm_cmpord_ps(__m128 __a, __m128 __b)
  952. {
  953.   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
  954. }
  955.  
  956. /// Compares two 32-bit float values in the low-order bits of both
  957. ///    operands to determine if the value in the first operand is unordered
  958. ///    with respect to the corresponding value in the second operand and
  959. ///    returns the result of the comparison in the low-order bits of a vector
  960. ///    of [4 x float].
  961. ///
  962. /// \headerfile <x86intrin.h>
  963. ///
  964. /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
  965. ///   instructions.
  966. ///
  967. /// \param __a
  968. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  969. ///    32 bits of this operand are used in the comparison.
  970. /// \param __b
  971. ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
  972. ///    32 bits of this operand are used in the comparison.
  973. /// \returns A 128-bit vector of [4 x float] containing the comparison results
  974. ///    in the low-order bits.
  975. static __inline__ __m128 __DEFAULT_FN_ATTRS
  976. _mm_cmpunord_ss(__m128 __a, __m128 __b)
  977. {
  978.   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
  979. }
  980.  
  981. /// Compares each of the corresponding 32-bit float values of the
  982. ///    128-bit vectors of [4 x float] to determine if the values in the first
  983. ///    operand are unordered with respect to those in the second operand.
  984. ///
  985. /// \headerfile <x86intrin.h>
  986. ///
  987. /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
  988. ///   instructions.
  989. ///
  990. /// \param __a
  991. ///    A 128-bit vector of [4 x float].
  992. /// \param __b
  993. ///    A 128-bit vector of [4 x float].
  994. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  995. static __inline__ __m128 __DEFAULT_FN_ATTRS
  996. _mm_cmpunord_ps(__m128 __a, __m128 __b)
  997. {
  998.   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
  999. }
  1000.  
  1001. /// Compares two 32-bit float values in the low-order bits of both
  1002. ///    operands for equality and returns the result of the comparison.
  1003. ///
  1004. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1005. ///
  1006. /// \headerfile <x86intrin.h>
  1007. ///
  1008. /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
  1009. ///   instructions.
  1010. ///
  1011. /// \param __a
  1012. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1013. ///    used in the comparison.
  1014. /// \param __b
  1015. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1016. ///    used in the comparison.
  1017. /// \returns An integer containing the comparison results. If either of the
  1018. ///    two lower 32-bit values is NaN, 0 is returned.
  1019. static __inline__ int __DEFAULT_FN_ATTRS
  1020. _mm_comieq_ss(__m128 __a, __m128 __b)
  1021. {
  1022.   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
  1023. }
  1024.  
  1025. /// Compares two 32-bit float values in the low-order bits of both
  1026. ///    operands to determine if the first operand is less than the second
  1027. ///    operand and returns the result of the comparison.
  1028. ///
  1029. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1030. ///
  1031. /// \headerfile <x86intrin.h>
  1032. ///
  1033. /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
  1034. ///   instructions.
  1035. ///
  1036. /// \param __a
  1037. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1038. ///    used in the comparison.
  1039. /// \param __b
  1040. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1041. ///    used in the comparison.
  1042. /// \returns An integer containing the comparison results. If either of the two
  1043. ///     lower 32-bit values is NaN, 0 is returned.
  1044. static __inline__ int __DEFAULT_FN_ATTRS
  1045. _mm_comilt_ss(__m128 __a, __m128 __b)
  1046. {
  1047.   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
  1048. }
  1049.  
  1050. /// Compares two 32-bit float values in the low-order bits of both
  1051. ///    operands to determine if the first operand is less than or equal to the
  1052. ///    second operand and returns the result of the comparison.
  1053. ///
  1054. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1055. ///
  1056. /// \headerfile <x86intrin.h>
  1057. ///
  1058. /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
  1059. ///
  1060. /// \param __a
  1061. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1062. ///    used in the comparison.
  1063. /// \param __b
  1064. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1065. ///    used in the comparison.
  1066. /// \returns An integer containing the comparison results. If either of the two
  1067. ///     lower 32-bit values is NaN, 0 is returned.
  1068. static __inline__ int __DEFAULT_FN_ATTRS
  1069. _mm_comile_ss(__m128 __a, __m128 __b)
  1070. {
  1071.   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
  1072. }
  1073.  
  1074. /// Compares two 32-bit float values in the low-order bits of both
  1075. ///    operands to determine if the first operand is greater than the second
  1076. ///    operand and returns the result of the comparison.
  1077. ///
  1078. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1079. ///
  1080. /// \headerfile <x86intrin.h>
  1081. ///
  1082. /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
  1083. ///
  1084. /// \param __a
  1085. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1086. ///    used in the comparison.
  1087. /// \param __b
  1088. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1089. ///    used in the comparison.
  1090. /// \returns An integer containing the comparison results. If either of the
  1091. ///     two lower 32-bit values is NaN, 0 is returned.
  1092. static __inline__ int __DEFAULT_FN_ATTRS
  1093. _mm_comigt_ss(__m128 __a, __m128 __b)
  1094. {
  1095.   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
  1096. }
  1097.  
  1098. /// Compares two 32-bit float values in the low-order bits of both
  1099. ///    operands to determine if the first operand is greater than or equal to
  1100. ///    the second operand and returns the result of the comparison.
  1101. ///
  1102. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1103. ///
  1104. /// \headerfile <x86intrin.h>
  1105. ///
  1106. /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
  1107. ///
  1108. /// \param __a
  1109. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1110. ///    used in the comparison.
  1111. /// \param __b
  1112. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1113. ///    used in the comparison.
  1114. /// \returns An integer containing the comparison results. If either of the two
  1115. ///    lower 32-bit values is NaN, 0 is returned.
  1116. static __inline__ int __DEFAULT_FN_ATTRS
  1117. _mm_comige_ss(__m128 __a, __m128 __b)
  1118. {
  1119.   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
  1120. }
  1121.  
  1122. /// Compares two 32-bit float values in the low-order bits of both
  1123. ///    operands to determine if the first operand is not equal to the second
  1124. ///    operand and returns the result of the comparison.
  1125. ///
  1126. ///    If either of the two lower 32-bit values is NaN, 1 is returned.
  1127. ///
  1128. /// \headerfile <x86intrin.h>
  1129. ///
  1130. /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
  1131. ///
  1132. /// \param __a
  1133. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1134. ///    used in the comparison.
  1135. /// \param __b
  1136. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1137. ///    used in the comparison.
  1138. /// \returns An integer containing the comparison results. If either of the
  1139. ///     two lower 32-bit values is NaN, 1 is returned.
  1140. static __inline__ int __DEFAULT_FN_ATTRS
  1141. _mm_comineq_ss(__m128 __a, __m128 __b)
  1142. {
  1143.   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
  1144. }
  1145.  
  1146. /// Performs an unordered comparison of two 32-bit float values using
  1147. ///    the low-order bits of both operands to determine equality and returns
  1148. ///    the result of the comparison.
  1149. ///
  1150. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1151. ///
  1152. /// \headerfile <x86intrin.h>
  1153. ///
  1154. /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
  1155. ///
  1156. /// \param __a
  1157. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1158. ///    used in the comparison.
  1159. /// \param __b
  1160. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1161. ///    used in the comparison.
  1162. /// \returns An integer containing the comparison results. If either of the two
  1163. ///     lower 32-bit values is NaN, 0 is returned.
  1164. static __inline__ int __DEFAULT_FN_ATTRS
  1165. _mm_ucomieq_ss(__m128 __a, __m128 __b)
  1166. {
  1167.   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
  1168. }
  1169.  
  1170. /// Performs an unordered comparison of two 32-bit float values using
  1171. ///    the low-order bits of both operands to determine if the first operand is
  1172. ///    less than the second operand and returns the result of the comparison.
  1173. ///
  1174. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1175. ///
  1176. /// \headerfile <x86intrin.h>
  1177. ///
  1178. /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
  1179. ///
  1180. /// \param __a
  1181. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1182. ///    used in the comparison.
  1183. /// \param __b
  1184. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1185. ///    used in the comparison.
  1186. /// \returns An integer containing the comparison results. If either of the two
  1187. ///    lower 32-bit values is NaN, 0 is returned.
  1188. static __inline__ int __DEFAULT_FN_ATTRS
  1189. _mm_ucomilt_ss(__m128 __a, __m128 __b)
  1190. {
  1191.   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
  1192. }
  1193.  
  1194. /// Performs an unordered comparison of two 32-bit float values using
  1195. ///    the low-order bits of both operands to determine if the first operand is
  1196. ///    less than or equal to the second operand and returns the result of the
  1197. ///    comparison.
  1198. ///
  1199. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1200. ///
  1201. /// \headerfile <x86intrin.h>
  1202. ///
  1203. /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
  1204. ///
  1205. /// \param __a
  1206. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1207. ///    used in the comparison.
  1208. /// \param __b
  1209. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1210. ///    used in the comparison.
  1211. /// \returns An integer containing the comparison results. If either of the two
  1212. ///     lower 32-bit values is NaN, 0 is returned.
  1213. static __inline__ int __DEFAULT_FN_ATTRS
  1214. _mm_ucomile_ss(__m128 __a, __m128 __b)
  1215. {
  1216.   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
  1217. }
  1218.  
  1219. /// Performs an unordered comparison of two 32-bit float values using
  1220. ///    the low-order bits of both operands to determine if the first operand is
  1221. ///    greater than the second operand and returns the result of the
  1222. ///    comparison.
  1223. ///
  1224. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1225. ///
  1226. /// \headerfile <x86intrin.h>
  1227. ///
  1228. /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
  1229. ///
  1230. /// \param __a
  1231. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1232. ///    used in the comparison.
  1233. /// \param __b
  1234. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1235. ///    used in the comparison.
  1236. /// \returns An integer containing the comparison results. If either of the two
  1237. ///     lower 32-bit values is NaN, 0 is returned.
  1238. static __inline__ int __DEFAULT_FN_ATTRS
  1239. _mm_ucomigt_ss(__m128 __a, __m128 __b)
  1240. {
  1241.   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
  1242. }
  1243.  
  1244. /// Performs an unordered comparison of two 32-bit float values using
  1245. ///    the low-order bits of both operands to determine if the first operand is
  1246. ///    greater than or equal to the second operand and returns the result of
  1247. ///    the comparison.
  1248. ///
  1249. ///    If either of the two lower 32-bit values is NaN, 0 is returned.
  1250. ///
  1251. /// \headerfile <x86intrin.h>
  1252. ///
  1253. /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
  1254. ///
  1255. /// \param __a
  1256. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1257. ///    used in the comparison.
  1258. /// \param __b
  1259. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1260. ///    used in the comparison.
  1261. /// \returns An integer containing the comparison results. If either of the two
  1262. ///     lower 32-bit values is NaN, 0 is returned.
  1263. static __inline__ int __DEFAULT_FN_ATTRS
  1264. _mm_ucomige_ss(__m128 __a, __m128 __b)
  1265. {
  1266.   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
  1267. }
  1268.  
  1269. /// Performs an unordered comparison of two 32-bit float values using
  1270. ///    the low-order bits of both operands to determine inequality and returns
  1271. ///    the result of the comparison.
  1272. ///
  1273. ///    If either of the two lower 32-bit values is NaN, 1 is returned.
  1274. ///
  1275. /// \headerfile <x86intrin.h>
  1276. ///
  1277. /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
  1278. ///
  1279. /// \param __a
  1280. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1281. ///    used in the comparison.
  1282. /// \param __b
  1283. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1284. ///    used in the comparison.
  1285. /// \returns An integer containing the comparison results. If either of the two
  1286. ///    lower 32-bit values is NaN, 1 is returned.
  1287. static __inline__ int __DEFAULT_FN_ATTRS
  1288. _mm_ucomineq_ss(__m128 __a, __m128 __b)
  1289. {
  1290.   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
  1291. }
  1292.  
  1293. /// Converts a float value contained in the lower 32 bits of a vector of
  1294. ///    [4 x float] into a 32-bit integer.
  1295. ///
  1296. /// \headerfile <x86intrin.h>
  1297. ///
  1298. /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
  1299. ///   instructions.
  1300. ///
  1301. /// \param __a
  1302. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1303. ///    used in the conversion.
  1304. /// \returns A 32-bit integer containing the converted value.
  1305. static __inline__ int __DEFAULT_FN_ATTRS
  1306. _mm_cvtss_si32(__m128 __a)
  1307. {
  1308.   return __builtin_ia32_cvtss2si((__v4sf)__a);
  1309. }
  1310.  
  1311. /// Converts a float value contained in the lower 32 bits of a vector of
  1312. ///    [4 x float] into a 32-bit integer.
  1313. ///
  1314. /// \headerfile <x86intrin.h>
  1315. ///
  1316. /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
  1317. ///   instructions.
  1318. ///
  1319. /// \param __a
  1320. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1321. ///    used in the conversion.
  1322. /// \returns A 32-bit integer containing the converted value.
  1323. static __inline__ int __DEFAULT_FN_ATTRS
  1324. _mm_cvt_ss2si(__m128 __a)
  1325. {
  1326.   return _mm_cvtss_si32(__a);
  1327. }
  1328.  
  1329. #ifdef __x86_64__
  1330.  
  1331. /// Converts a float value contained in the lower 32 bits of a vector of
  1332. ///    [4 x float] into a 64-bit integer.
  1333. ///
  1334. /// \headerfile <x86intrin.h>
  1335. ///
  1336. /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
  1337. ///   instructions.
  1338. ///
  1339. /// \param __a
  1340. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1341. ///    used in the conversion.
  1342. /// \returns A 64-bit integer containing the converted value.
  1343. static __inline__ long long __DEFAULT_FN_ATTRS
  1344. _mm_cvtss_si64(__m128 __a)
  1345. {
  1346.   return __builtin_ia32_cvtss2si64((__v4sf)__a);
  1347. }
  1348.  
  1349. #endif
  1350.  
  1351. /// Converts two low-order float values in a 128-bit vector of
  1352. ///    [4 x float] into a 64-bit vector of [2 x i32].
  1353. ///
  1354. /// \headerfile <x86intrin.h>
  1355. ///
  1356. /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
  1357. ///
  1358. /// \param __a
  1359. ///    A 128-bit vector of [4 x float].
  1360. /// \returns A 64-bit integer vector containing the converted values.
  1361. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  1362. _mm_cvtps_pi32(__m128 __a)
  1363. {
  1364.   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
  1365. }
  1366.  
  1367. /// Converts two low-order float values in a 128-bit vector of
  1368. ///    [4 x float] into a 64-bit vector of [2 x i32].
  1369. ///
  1370. /// \headerfile <x86intrin.h>
  1371. ///
  1372. /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
  1373. ///
  1374. /// \param __a
  1375. ///    A 128-bit vector of [4 x float].
  1376. /// \returns A 64-bit integer vector containing the converted values.
  1377. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  1378. _mm_cvt_ps2pi(__m128 __a)
  1379. {
  1380.   return _mm_cvtps_pi32(__a);
  1381. }
  1382.  
  1383. /// Converts a float value contained in the lower 32 bits of a vector of
  1384. ///    [4 x float] into a 32-bit integer, truncating the result when it is
  1385. ///    inexact.
  1386. ///
  1387. /// \headerfile <x86intrin.h>
  1388. ///
  1389. /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
  1390. ///   instructions.
  1391. ///
  1392. /// \param __a
  1393. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1394. ///    used in the conversion.
  1395. /// \returns A 32-bit integer containing the converted value.
  1396. static __inline__ int __DEFAULT_FN_ATTRS
  1397. _mm_cvttss_si32(__m128 __a)
  1398. {
  1399.   return __builtin_ia32_cvttss2si((__v4sf)__a);
  1400. }
  1401.  
  1402. /// Converts a float value contained in the lower 32 bits of a vector of
  1403. ///    [4 x float] into a 32-bit integer, truncating the result when it is
  1404. ///    inexact.
  1405. ///
  1406. /// \headerfile <x86intrin.h>
  1407. ///
  1408. /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
  1409. ///   instructions.
  1410. ///
  1411. /// \param __a
  1412. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1413. ///    used in the conversion.
  1414. /// \returns A 32-bit integer containing the converted value.
  1415. static __inline__ int __DEFAULT_FN_ATTRS
  1416. _mm_cvtt_ss2si(__m128 __a)
  1417. {
  1418.   return _mm_cvttss_si32(__a);
  1419. }
  1420.  
  1421. #ifdef __x86_64__
  1422. /// Converts a float value contained in the lower 32 bits of a vector of
  1423. ///    [4 x float] into a 64-bit integer, truncating the result when it is
  1424. ///    inexact.
  1425. ///
  1426. /// \headerfile <x86intrin.h>
  1427. ///
  1428. /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
  1429. ///   instructions.
  1430. ///
  1431. /// \param __a
  1432. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1433. ///    used in the conversion.
  1434. /// \returns A 64-bit integer containing the converted value.
  1435. static __inline__ long long __DEFAULT_FN_ATTRS
  1436. _mm_cvttss_si64(__m128 __a)
  1437. {
  1438.   return __builtin_ia32_cvttss2si64((__v4sf)__a);
  1439. }
  1440. #endif
  1441.  
  1442. /// Converts two low-order float values in a 128-bit vector of
  1443. ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
  1444. ///    when it is inexact.
  1445. ///
  1446. /// \headerfile <x86intrin.h>
  1447. ///
  1448. /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
  1449. ///   instructions.
  1450. ///
  1451. /// \param __a
  1452. ///    A 128-bit vector of [4 x float].
  1453. /// \returns A 64-bit integer vector containing the converted values.
  1454. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  1455. _mm_cvttps_pi32(__m128 __a)
  1456. {
  1457.   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
  1458. }
  1459.  
  1460. /// Converts two low-order float values in a 128-bit vector of [4 x
  1461. ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
  1462. ///    is inexact.
  1463. ///
  1464. /// \headerfile <x86intrin.h>
  1465. ///
  1466. /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
  1467. ///
  1468. /// \param __a
  1469. ///    A 128-bit vector of [4 x float].
  1470. /// \returns A 64-bit integer vector containing the converted values.
  1471. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  1472. _mm_cvtt_ps2pi(__m128 __a)
  1473. {
  1474.   return _mm_cvttps_pi32(__a);
  1475. }
  1476.  
  1477. /// Converts a 32-bit signed integer value into a floating point value
  1478. ///    and writes it to the lower 32 bits of the destination. The remaining
  1479. ///    higher order elements of the destination vector are copied from the
  1480. ///    corresponding elements in the first operand.
  1481. ///
  1482. /// \headerfile <x86intrin.h>
  1483. ///
  1484. /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
  1485. ///
  1486. /// \param __a
  1487. ///    A 128-bit vector of [4 x float].
  1488. /// \param __b
  1489. ///    A 32-bit signed integer operand containing the value to be converted.
  1490. /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
  1491. ///    converted value of the second operand. The upper 96 bits are copied from
  1492. ///    the upper 96 bits of the first operand.
  1493. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1494. _mm_cvtsi32_ss(__m128 __a, int __b)
  1495. {
  1496.   __a[0] = __b;
  1497.   return __a;
  1498. }
  1499.  
  1500. /// Converts a 32-bit signed integer value into a floating point value
  1501. ///    and writes it to the lower 32 bits of the destination. The remaining
  1502. ///    higher order elements of the destination are copied from the
  1503. ///    corresponding elements in the first operand.
  1504. ///
  1505. /// \headerfile <x86intrin.h>
  1506. ///
  1507. /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
  1508. ///
  1509. /// \param __a
  1510. ///    A 128-bit vector of [4 x float].
  1511. /// \param __b
  1512. ///    A 32-bit signed integer operand containing the value to be converted.
  1513. /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
  1514. ///    converted value of the second operand. The upper 96 bits are copied from
  1515. ///    the upper 96 bits of the first operand.
  1516. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1517. _mm_cvt_si2ss(__m128 __a, int __b)
  1518. {
  1519.   return _mm_cvtsi32_ss(__a, __b);
  1520. }
  1521.  
  1522. #ifdef __x86_64__
  1523.  
  1524. /// Converts a 64-bit signed integer value into a floating point value
  1525. ///    and writes it to the lower 32 bits of the destination. The remaining
  1526. ///    higher order elements of the destination are copied from the
  1527. ///    corresponding elements in the first operand.
  1528. ///
  1529. /// \headerfile <x86intrin.h>
  1530. ///
  1531. /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
  1532. ///
  1533. /// \param __a
  1534. ///    A 128-bit vector of [4 x float].
  1535. /// \param __b
  1536. ///    A 64-bit signed integer operand containing the value to be converted.
  1537. /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
  1538. ///    converted value of the second operand. The upper 96 bits are copied from
  1539. ///    the upper 96 bits of the first operand.
  1540. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1541. _mm_cvtsi64_ss(__m128 __a, long long __b)
  1542. {
  1543.   __a[0] = __b;
  1544.   return __a;
  1545. }
  1546.  
  1547. #endif
  1548.  
  1549. /// Converts two elements of a 64-bit vector of [2 x i32] into two
  1550. ///    floating point values and writes them to the lower 64-bits of the
  1551. ///    destination. The remaining higher order elements of the destination are
  1552. ///    copied from the corresponding elements in the first operand.
  1553. ///
  1554. /// \headerfile <x86intrin.h>
  1555. ///
  1556. /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
  1557. ///
  1558. /// \param __a
  1559. ///    A 128-bit vector of [4 x float].
  1560. /// \param __b
  1561. ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
  1562. ///    and written to the corresponding low-order elements in the destination.
  1563. /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
  1564. ///    converted value of the second operand. The upper 64 bits are copied from
  1565. ///    the upper 64 bits of the first operand.
  1566. static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
  1567. _mm_cvtpi32_ps(__m128 __a, __m64 __b)
  1568. {
  1569.   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
  1570. }
  1571.  
  1572. /// Converts two elements of a 64-bit vector of [2 x i32] into two
  1573. ///    floating point values and writes them to the lower 64-bits of the
  1574. ///    destination. The remaining higher order elements of the destination are
  1575. ///    copied from the corresponding elements in the first operand.
  1576. ///
  1577. /// \headerfile <x86intrin.h>
  1578. ///
  1579. /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
  1580. ///
  1581. /// \param __a
  1582. ///    A 128-bit vector of [4 x float].
  1583. /// \param __b
  1584. ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
  1585. ///    and written to the corresponding low-order elements in the destination.
  1586. /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
  1587. ///    converted value from the second operand. The upper 64 bits are copied
  1588. ///    from the upper 64 bits of the first operand.
  1589. static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
  1590. _mm_cvt_pi2ps(__m128 __a, __m64 __b)
  1591. {
  1592.   return _mm_cvtpi32_ps(__a, __b);
  1593. }
  1594.  
  1595. /// Extracts a float value contained in the lower 32 bits of a vector of
  1596. ///    [4 x float].
  1597. ///
  1598. /// \headerfile <x86intrin.h>
  1599. ///
  1600. /// This intrinsic has no corresponding instruction.
  1601. ///
  1602. /// \param __a
  1603. ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
  1604. ///    used in the extraction.
  1605. /// \returns A 32-bit float containing the extracted value.
  1606. static __inline__ float __DEFAULT_FN_ATTRS
  1607. _mm_cvtss_f32(__m128 __a)
  1608. {
  1609.   return __a[0];
  1610. }
  1611.  
  1612. /// Loads two packed float values from the address \a __p into the
  1613. ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
  1614. ///     are copied from the low-order bits of the first operand.
  1615. ///
  1616. /// \headerfile <x86intrin.h>
  1617. ///
  1618. /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
  1619. ///
  1620. /// \param __a
  1621. ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
  1622. ///    of the destination.
  1623. /// \param __p
  1624. ///    A pointer to two packed float values. Bits [63:0] are written to bits
  1625. ///    [127:64] of the destination.
  1626. /// \returns A 128-bit vector of [4 x float] containing the moved values.
  1627. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1628. _mm_loadh_pi(__m128 __a, const __m64 *__p)
  1629. {
  1630.   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  1631.   struct __mm_loadh_pi_struct {
  1632.     __mm_loadh_pi_v2f32 __u;
  1633.   } __attribute__((__packed__, __may_alias__));
  1634.   __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
  1635.   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  1636.   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
  1637. }
  1638.  
  1639. /// Loads two packed float values from the address \a __p into the
  1640. ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
  1641. ///    are copied from the high-order bits of the first operand.
  1642. ///
  1643. /// \headerfile <x86intrin.h>
  1644. ///
  1645. /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
  1646. ///
  1647. /// \param __a
  1648. ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
  1649. ///    [127:64] of the destination.
  1650. /// \param __p
  1651. ///    A pointer to two packed float values. Bits [63:0] are written to bits
  1652. ///    [63:0] of the destination.
  1653. /// \returns A 128-bit vector of [4 x float] containing the moved values.
  1654. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1655. _mm_loadl_pi(__m128 __a, const __m64 *__p)
  1656. {
  1657.   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  1658.   struct __mm_loadl_pi_struct {
  1659.     __mm_loadl_pi_v2f32 __u;
  1660.   } __attribute__((__packed__, __may_alias__));
  1661.   __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
  1662.   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  1663.   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
  1664. }
  1665.  
  1666. /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
  1667. ///    32 bits of the vector are initialized with the single-precision
  1668. ///    floating-point value loaded from a specified memory location. The upper
  1669. ///    96 bits are set to zero.
  1670. ///
  1671. /// \headerfile <x86intrin.h>
  1672. ///
  1673. /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
  1674. ///
  1675. /// \param __p
  1676. ///    A pointer to a 32-bit memory location containing a single-precision
  1677. ///    floating-point value.
  1678. /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
  1679. ///    lower 32 bits contain the value loaded from the memory location. The
  1680. ///    upper 96 bits are set to zero.
  1681. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1682. _mm_load_ss(const float *__p)
  1683. {
  1684.   struct __mm_load_ss_struct {
  1685.     float __u;
  1686.   } __attribute__((__packed__, __may_alias__));
  1687.   float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
  1688.   return __extension__ (__m128){ __u, 0, 0, 0 };
  1689. }
  1690.  
  1691. /// Loads a 32-bit float value and duplicates it to all four vector
  1692. ///    elements of a 128-bit vector of [4 x float].
  1693. ///
  1694. /// \headerfile <x86intrin.h>
  1695. ///
  1696. /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
  1697. ///    instruction.
  1698. ///
  1699. /// \param __p
  1700. ///    A pointer to a float value to be loaded and duplicated.
  1701. /// \returns A 128-bit vector of [4 x float] containing the loaded and
  1702. ///    duplicated values.
  1703. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1704. _mm_load1_ps(const float *__p)
  1705. {
  1706.   struct __mm_load1_ps_struct {
  1707.     float __u;
  1708.   } __attribute__((__packed__, __may_alias__));
  1709.   float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
  1710.   return __extension__ (__m128){ __u, __u, __u, __u };
  1711. }
  1712.  
  1713. #define        _mm_load_ps1(p) _mm_load1_ps(p)
  1714.  
  1715. /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
  1716. ///    memory location.
  1717. ///
  1718. /// \headerfile <x86intrin.h>
  1719. ///
  1720. /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
  1721. ///
  1722. /// \param __p
  1723. ///    A pointer to a 128-bit memory location. The address of the memory
  1724. ///    location has to be 128-bit aligned.
  1725. /// \returns A 128-bit vector of [4 x float] containing the loaded values.
  1726. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1727. _mm_load_ps(const float *__p)
  1728. {
  1729.   return *(const __m128*)__p;
  1730. }
  1731.  
  1732. /// Loads a 128-bit floating-point vector of [4 x float] from an
  1733. ///    unaligned memory location.
  1734. ///
  1735. /// \headerfile <x86intrin.h>
  1736. ///
  1737. /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
  1738. ///
  1739. /// \param __p
  1740. ///    A pointer to a 128-bit memory location. The address of the memory
  1741. ///    location does not have to be aligned.
  1742. /// \returns A 128-bit vector of [4 x float] containing the loaded values.
  1743. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1744. _mm_loadu_ps(const float *__p)
  1745. {
  1746.   struct __loadu_ps {
  1747.     __m128_u __v;
  1748.   } __attribute__((__packed__, __may_alias__));
  1749.   return ((const struct __loadu_ps*)__p)->__v;
  1750. }
  1751.  
  1752. /// Loads four packed float values, in reverse order, from an aligned
  1753. ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
  1754. ///
  1755. /// \headerfile <x86intrin.h>
  1756. ///
  1757. /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
  1758. ///    instruction.
  1759. ///
  1760. /// \param __p
  1761. ///    A pointer to a 128-bit memory location. The address of the memory
  1762. ///    location has to be 128-bit aligned.
  1763. /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
  1764. ///    in reverse order.
  1765. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1766. _mm_loadr_ps(const float *__p)
  1767. {
  1768.   __m128 __a = _mm_load_ps(__p);
  1769.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
  1770. }
  1771.  
  1772. /// Create a 128-bit vector of [4 x float] with undefined values.
  1773. ///
  1774. /// \headerfile <x86intrin.h>
  1775. ///
  1776. /// This intrinsic has no corresponding instruction.
  1777. ///
  1778. /// \returns A 128-bit vector of [4 x float] containing undefined values.
  1779. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1780. _mm_undefined_ps(void)
  1781. {
  1782.   return (__m128)__builtin_ia32_undef128();
  1783. }
  1784.  
  1785. /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
  1786. ///    32 bits of the vector are initialized with the specified single-precision
  1787. ///    floating-point value. The upper 96 bits are set to zero.
  1788. ///
  1789. /// \headerfile <x86intrin.h>
  1790. ///
  1791. /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
  1792. ///
  1793. /// \param __w
  1794. ///    A single-precision floating-point value used to initialize the lower 32
  1795. ///    bits of the result.
  1796. /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
  1797. ///    lower 32 bits contain the value provided in the source operand. The
  1798. ///    upper 96 bits are set to zero.
  1799. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1800. _mm_set_ss(float __w)
  1801. {
  1802.   return __extension__ (__m128){ __w, 0, 0, 0 };
  1803. }
  1804.  
  1805. /// Constructs a 128-bit floating-point vector of [4 x float], with each
  1806. ///    of the four single-precision floating-point vector elements set to the
  1807. ///    specified single-precision floating-point value.
  1808. ///
  1809. /// \headerfile <x86intrin.h>
  1810. ///
  1811. /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
  1812. ///
  1813. /// \param __w
  1814. ///    A single-precision floating-point value used to initialize each vector
  1815. ///    element of the result.
  1816. /// \returns An initialized 128-bit floating-point vector of [4 x float].
  1817. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1818. _mm_set1_ps(float __w)
  1819. {
  1820.   return __extension__ (__m128){ __w, __w, __w, __w };
  1821. }
  1822.  
  1823. /* Microsoft specific. */
  1824. /// Constructs a 128-bit floating-point vector of [4 x float], with each
  1825. ///    of the four single-precision floating-point vector elements set to the
  1826. ///    specified single-precision floating-point value.
  1827. ///
  1828. /// \headerfile <x86intrin.h>
  1829. ///
  1830. /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
  1831. ///
  1832. /// \param __w
  1833. ///    A single-precision floating-point value used to initialize each vector
  1834. ///    element of the result.
  1835. /// \returns An initialized 128-bit floating-point vector of [4 x float].
  1836. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1837. _mm_set_ps1(float __w)
  1838. {
  1839.     return _mm_set1_ps(__w);
  1840. }
  1841.  
  1842. /// Constructs a 128-bit floating-point vector of [4 x float]
  1843. ///    initialized with the specified single-precision floating-point values.
  1844. ///
  1845. /// \headerfile <x86intrin.h>
  1846. ///
  1847. /// This intrinsic is a utility function and does not correspond to a specific
  1848. ///    instruction.
  1849. ///
  1850. /// \param __z
  1851. ///    A single-precision floating-point value used to initialize bits [127:96]
  1852. ///    of the result.
  1853. /// \param __y
  1854. ///    A single-precision floating-point value used to initialize bits [95:64]
  1855. ///    of the result.
  1856. /// \param __x
  1857. ///    A single-precision floating-point value used to initialize bits [63:32]
  1858. ///    of the result.
  1859. /// \param __w
  1860. ///    A single-precision floating-point value used to initialize bits [31:0]
  1861. ///    of the result.
  1862. /// \returns An initialized 128-bit floating-point vector of [4 x float].
  1863. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1864. _mm_set_ps(float __z, float __y, float __x, float __w)
  1865. {
  1866.   return __extension__ (__m128){ __w, __x, __y, __z };
  1867. }
  1868.  
  1869. /// Constructs a 128-bit floating-point vector of [4 x float],
  1870. ///    initialized in reverse order with the specified 32-bit single-precision
  1871. ///    float-point values.
  1872. ///
  1873. /// \headerfile <x86intrin.h>
  1874. ///
  1875. /// This intrinsic is a utility function and does not correspond to a specific
  1876. ///    instruction.
  1877. ///
  1878. /// \param __z
  1879. ///    A single-precision floating-point value used to initialize bits [31:0]
  1880. ///    of the result.
  1881. /// \param __y
  1882. ///    A single-precision floating-point value used to initialize bits [63:32]
  1883. ///    of the result.
  1884. /// \param __x
  1885. ///    A single-precision floating-point value used to initialize bits [95:64]
  1886. ///    of the result.
  1887. /// \param __w
  1888. ///    A single-precision floating-point value used to initialize bits [127:96]
  1889. ///    of the result.
  1890. /// \returns An initialized 128-bit floating-point vector of [4 x float].
  1891. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1892. _mm_setr_ps(float __z, float __y, float __x, float __w)
  1893. {
  1894.   return __extension__ (__m128){ __z, __y, __x, __w };
  1895. }
  1896.  
  1897. /// Constructs a 128-bit floating-point vector of [4 x float] initialized
  1898. ///    to zero.
  1899. ///
  1900. /// \headerfile <x86intrin.h>
  1901. ///
  1902. /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
  1903. ///
  1904. /// \returns An initialized 128-bit floating-point vector of [4 x float] with
  1905. ///    all elements set to zero.
  1906. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1907. _mm_setzero_ps(void)
  1908. {
  1909.   return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
  1910. }
  1911.  
  1912. /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
  1913. ///    memory location.
  1914. ///
  1915. /// \headerfile <x86intrin.h>
  1916. ///
  1917. /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
  1918. ///
  1919. /// \param __p
  1920. ///    A pointer to a 64-bit memory location.
  1921. /// \param __a
  1922. ///    A 128-bit vector of [4 x float] containing the values to be stored.
  1923. static __inline__ void __DEFAULT_FN_ATTRS
  1924. _mm_storeh_pi(__m64 *__p, __m128 __a)
  1925. {
  1926.   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
  1927.   struct __mm_storeh_pi_struct {
  1928.     __mm_storeh_pi_v2f32 __u;
  1929.   } __attribute__((__packed__, __may_alias__));
  1930.   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
  1931. }
  1932.  
  1933. /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
  1934. ///     memory location.
  1935. ///
  1936. /// \headerfile <x86intrin.h>
  1937. ///
  1938. /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
  1939. ///
  1940. /// \param __p
  1941. ///    A pointer to a memory location that will receive the float values.
  1942. /// \param __a
  1943. ///    A 128-bit vector of [4 x float] containing the values to be stored.
  1944. static __inline__ void __DEFAULT_FN_ATTRS
  1945. _mm_storel_pi(__m64 *__p, __m128 __a)
  1946. {
  1947.   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
  1948.   struct __mm_storeh_pi_struct {
  1949.     __mm_storeh_pi_v2f32 __u;
  1950.   } __attribute__((__packed__, __may_alias__));
  1951.   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
  1952. }
  1953.  
  1954. /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
  1955. ///     memory location.
  1956. ///
  1957. /// \headerfile <x86intrin.h>
  1958. ///
  1959. /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
  1960. ///
  1961. /// \param __p
  1962. ///    A pointer to a 32-bit memory location.
  1963. /// \param __a
  1964. ///    A 128-bit vector of [4 x float] containing the value to be stored.
  1965. static __inline__ void __DEFAULT_FN_ATTRS
  1966. _mm_store_ss(float *__p, __m128 __a)
  1967. {
  1968.   struct __mm_store_ss_struct {
  1969.     float __u;
  1970.   } __attribute__((__packed__, __may_alias__));
  1971.   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
  1972. }
  1973.  
  1974. /// Stores a 128-bit vector of [4 x float] to an unaligned memory
  1975. ///    location.
  1976. ///
  1977. /// \headerfile <x86intrin.h>
  1978. ///
  1979. /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
  1980. ///
  1981. /// \param __p
  1982. ///    A pointer to a 128-bit memory location. The address of the memory
  1983. ///    location does not have to be aligned.
  1984. /// \param __a
  1985. ///    A 128-bit vector of [4 x float] containing the values to be stored.
  1986. static __inline__ void __DEFAULT_FN_ATTRS
  1987. _mm_storeu_ps(float *__p, __m128 __a)
  1988. {
  1989.   struct __storeu_ps {
  1990.     __m128_u __v;
  1991.   } __attribute__((__packed__, __may_alias__));
  1992.   ((struct __storeu_ps*)__p)->__v = __a;
  1993. }
  1994.  
  1995. /// Stores a 128-bit vector of [4 x float] into an aligned memory
  1996. ///    location.
  1997. ///
  1998. /// \headerfile <x86intrin.h>
  1999. ///
  2000. /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
  2001. ///
  2002. /// \param __p
  2003. ///    A pointer to a 128-bit memory location. The address of the memory
  2004. ///    location has to be 16-byte aligned.
  2005. /// \param __a
  2006. ///    A 128-bit vector of [4 x float] containing the values to be stored.
  2007. static __inline__ void __DEFAULT_FN_ATTRS
  2008. _mm_store_ps(float *__p, __m128 __a)
  2009. {
  2010.   *(__m128*)__p = __a;
  2011. }
  2012.  
  2013. /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
  2014. ///    four contiguous elements in an aligned memory location.
  2015. ///
  2016. /// \headerfile <x86intrin.h>
  2017. ///
  2018. /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
  2019. ///    instruction.
  2020. ///
  2021. /// \param __p
  2022. ///    A pointer to a 128-bit memory location.
  2023. /// \param __a
  2024. ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
  2025. ///    of the four contiguous elements pointed by \a __p.
  2026. static __inline__ void __DEFAULT_FN_ATTRS
  2027. _mm_store1_ps(float *__p, __m128 __a)
  2028. {
  2029.   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
  2030.   _mm_store_ps(__p, __a);
  2031. }
  2032.  
  2033. /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
  2034. ///    four contiguous elements in an aligned memory location.
  2035. ///
  2036. /// \headerfile <x86intrin.h>
  2037. ///
  2038. /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
  2039. ///    instruction.
  2040. ///
  2041. /// \param __p
  2042. ///    A pointer to a 128-bit memory location.
  2043. /// \param __a
  2044. ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
  2045. ///    of the four contiguous elements pointed by \a __p.
  2046. static __inline__ void __DEFAULT_FN_ATTRS
  2047. _mm_store_ps1(float *__p, __m128 __a)
  2048. {
  2049.   _mm_store1_ps(__p, __a);
  2050. }
  2051.  
  2052. /// Stores float values from a 128-bit vector of [4 x float] to an
  2053. ///    aligned memory location in reverse order.
  2054. ///
  2055. /// \headerfile <x86intrin.h>
  2056. ///
  2057. /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
  2058. ///    instruction.
  2059. ///
  2060. /// \param __p
  2061. ///    A pointer to a 128-bit memory location. The address of the memory
  2062. ///    location has to be 128-bit aligned.
  2063. /// \param __a
  2064. ///    A 128-bit vector of [4 x float] containing the values to be stored.
  2065. static __inline__ void __DEFAULT_FN_ATTRS
  2066. _mm_storer_ps(float *__p, __m128 __a)
  2067. {
  2068.   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
  2069.   _mm_store_ps(__p, __a);
  2070. }
  2071.  
  2072. #define _MM_HINT_ET0 7
  2073. #define _MM_HINT_ET1 6
  2074. #define _MM_HINT_T0  3
  2075. #define _MM_HINT_T1  2
  2076. #define _MM_HINT_T2  1
  2077. #define _MM_HINT_NTA 0
  2078.  
  2079. #ifndef _MSC_VER
  2080. /* FIXME: We have to #define this because "sel" must be a constant integer, and
  2081.    Sema doesn't do any form of constant propagation yet. */
  2082.  
  2083. /// Loads one cache line of data from the specified address to a location
  2084. ///    closer to the processor.
  2085. ///
  2086. /// \headerfile <x86intrin.h>
  2087. ///
  2088. /// \code
  2089. /// void _mm_prefetch(const void *a, const int sel);
  2090. /// \endcode
  2091. ///
  2092. /// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
  2093. ///
  2094. /// \param a
  2095. ///    A pointer to a memory location containing a cache line of data.
  2096. /// \param sel
  2097. ///    A predefined integer constant specifying the type of prefetch
  2098. ///    operation: \n
  2099. ///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
  2100. ///    PREFETCHNTA instruction will be generated. \n
  2101. ///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
  2102. ///    be generated. \n
  2103. ///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
  2104. ///    be generated. \n
  2105. ///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
  2106. ///    be generated.
  2107. #define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
  2108.                                                  ((sel) >> 2) & 1, (sel) & 0x3))
  2109. #endif
  2110.  
  2111. /// Stores a 64-bit integer in the specified aligned memory location. To
  2112. ///    minimize caching, the data is flagged as non-temporal (unlikely to be
  2113. ///    used again soon).
  2114. ///
  2115. /// \headerfile <x86intrin.h>
  2116. ///
  2117. /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
  2118. ///
  2119. /// \param __p
  2120. ///    A pointer to an aligned memory location used to store the register value.
  2121. /// \param __a
  2122. ///    A 64-bit integer containing the value to be stored.
  2123. static __inline__ void __DEFAULT_FN_ATTRS_MMX
  2124. _mm_stream_pi(__m64 *__p, __m64 __a)
  2125. {
  2126.   __builtin_ia32_movntq(__p, __a);
  2127. }
  2128.  
  2129. /// Moves packed float values from a 128-bit vector of [4 x float] to a
  2130. ///    128-bit aligned memory location. To minimize caching, the data is flagged
  2131. ///    as non-temporal (unlikely to be used again soon).
  2132. ///
  2133. /// \headerfile <x86intrin.h>
  2134. ///
  2135. /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
  2136. ///
  2137. /// \param __p
  2138. ///    A pointer to a 128-bit aligned memory location that will receive the
  2139. ///    single-precision floating-point values.
  2140. /// \param __a
  2141. ///    A 128-bit vector of [4 x float] containing the values to be moved.
  2142. static __inline__ void __DEFAULT_FN_ATTRS
  2143. _mm_stream_ps(float *__p, __m128 __a)
  2144. {
  2145.   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
  2146. }
  2147.  
  2148. #if defined(__cplusplus)
  2149. extern "C" {
  2150. #endif
  2151.  
  2152. /// Forces strong memory ordering (serialization) between store
  2153. ///    instructions preceding this instruction and store instructions following
  2154. ///    this instruction, ensuring the system completes all previous stores
  2155. ///    before executing subsequent stores.
  2156. ///
  2157. /// \headerfile <x86intrin.h>
  2158. ///
  2159. /// This intrinsic corresponds to the <c> SFENCE </c> instruction.
  2160. ///
  2161. void _mm_sfence(void);
  2162.  
  2163. #if defined(__cplusplus)
  2164. } // extern "C"
  2165. #endif
  2166.  
  2167. /// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
  2168. ///    returns it, as specified by the immediate integer operand.
  2169. ///
  2170. /// \headerfile <x86intrin.h>
  2171. ///
  2172. /// \code
  2173. /// int _mm_extract_pi16(__m64 a, int n);
  2174. /// \endcode
  2175. ///
  2176. /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
  2177. ///
  2178. /// \param a
  2179. ///    A 64-bit vector of [4 x i16].
  2180. /// \param n
  2181. ///    An immediate integer operand that determines which bits are extracted: \n
  2182. ///    0: Bits [15:0] are copied to the destination. \n
  2183. ///    1: Bits [31:16] are copied to the destination. \n
  2184. ///    2: Bits [47:32] are copied to the destination. \n
  2185. ///    3: Bits [63:48] are copied to the destination.
  2186. /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
  2187. #define _mm_extract_pi16(a, n) \
  2188.   ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
  2189.  
  2190. /// Copies data from the 64-bit vector of [4 x i16] to the destination,
  2191. ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
  2192. ///    specified by the immediate operand \a n.
  2193. ///
  2194. /// \headerfile <x86intrin.h>
  2195. ///
  2196. /// \code
  2197. /// __m64 _mm_insert_pi16(__m64 a, int d, int n);
  2198. /// \endcode
  2199. ///
  2200. /// This intrinsic corresponds to the <c> PINSRW </c> instruction.
  2201. ///
  2202. /// \param a
  2203. ///    A 64-bit vector of [4 x i16].
  2204. /// \param d
  2205. ///    An integer. The lower 16-bit value from this operand is written to the
  2206. ///    destination at the offset specified by operand \a n.
  2207. /// \param n
  2208. ///    An immediate integer operant that determines which the bits to be used
  2209. ///    in the destination. \n
  2210. ///    0: Bits [15:0] are copied to the destination. \n
  2211. ///    1: Bits [31:16] are copied to the destination. \n
  2212. ///    2: Bits [47:32] are copied to the destination. \n
  2213. ///    3: Bits [63:48] are copied to the destination.  \n
  2214. ///    The remaining bits in the destination are copied from the corresponding
  2215. ///    bits in operand \a a.
  2216. /// \returns A 64-bit integer vector containing the copied packed data from the
  2217. ///    operands.
  2218. #define _mm_insert_pi16(a, d, n) \
  2219.   ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
  2220.  
  2221. /// Compares each of the corresponding packed 16-bit integer values of
  2222. ///    the 64-bit integer vectors, and writes the greater value to the
  2223. ///    corresponding bits in the destination.
  2224. ///
  2225. /// \headerfile <x86intrin.h>
  2226. ///
  2227. /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
  2228. ///
  2229. /// \param __a
  2230. ///    A 64-bit integer vector containing one of the source operands.
  2231. /// \param __b
  2232. ///    A 64-bit integer vector containing one of the source operands.
  2233. /// \returns A 64-bit integer vector containing the comparison results.
  2234. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2235. _mm_max_pi16(__m64 __a, __m64 __b)
  2236. {
  2237.   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
  2238. }
  2239.  
  2240. /// Compares each of the corresponding packed 8-bit unsigned integer
  2241. ///    values of the 64-bit integer vectors, and writes the greater value to the
  2242. ///    corresponding bits in the destination.
  2243. ///
  2244. /// \headerfile <x86intrin.h>
  2245. ///
  2246. /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
  2247. ///
  2248. /// \param __a
  2249. ///    A 64-bit integer vector containing one of the source operands.
  2250. /// \param __b
  2251. ///    A 64-bit integer vector containing one of the source operands.
  2252. /// \returns A 64-bit integer vector containing the comparison results.
  2253. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2254. _mm_max_pu8(__m64 __a, __m64 __b)
  2255. {
  2256.   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
  2257. }
  2258.  
  2259. /// Compares each of the corresponding packed 16-bit integer values of
  2260. ///    the 64-bit integer vectors, and writes the lesser value to the
  2261. ///    corresponding bits in the destination.
  2262. ///
  2263. /// \headerfile <x86intrin.h>
  2264. ///
  2265. /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
  2266. ///
  2267. /// \param __a
  2268. ///    A 64-bit integer vector containing one of the source operands.
  2269. /// \param __b
  2270. ///    A 64-bit integer vector containing one of the source operands.
  2271. /// \returns A 64-bit integer vector containing the comparison results.
  2272. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2273. _mm_min_pi16(__m64 __a, __m64 __b)
  2274. {
  2275.   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
  2276. }
  2277.  
  2278. /// Compares each of the corresponding packed 8-bit unsigned integer
  2279. ///    values of the 64-bit integer vectors, and writes the lesser value to the
  2280. ///    corresponding bits in the destination.
  2281. ///
  2282. /// \headerfile <x86intrin.h>
  2283. ///
  2284. /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
  2285. ///
  2286. /// \param __a
  2287. ///    A 64-bit integer vector containing one of the source operands.
  2288. /// \param __b
  2289. ///    A 64-bit integer vector containing one of the source operands.
  2290. /// \returns A 64-bit integer vector containing the comparison results.
  2291. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2292. _mm_min_pu8(__m64 __a, __m64 __b)
  2293. {
  2294.   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
  2295. }
  2296.  
  2297. /// Takes the most significant bit from each 8-bit element in a 64-bit
  2298. ///    integer vector to create an 8-bit mask value. Zero-extends the value to
  2299. ///    32-bit integer and writes it to the destination.
  2300. ///
  2301. /// \headerfile <x86intrin.h>
  2302. ///
  2303. /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
  2304. ///
  2305. /// \param __a
  2306. ///    A 64-bit integer vector containing the values with bits to be extracted.
  2307. /// \returns The most significant bit from each 8-bit element in \a __a,
  2308. ///    written to bits [7:0].
  2309. static __inline__ int __DEFAULT_FN_ATTRS_MMX
  2310. _mm_movemask_pi8(__m64 __a)
  2311. {
  2312.   return __builtin_ia32_pmovmskb((__v8qi)__a);
  2313. }
  2314.  
  2315. /// Multiplies packed 16-bit unsigned integer values and writes the
  2316. ///    high-order 16 bits of each 32-bit product to the corresponding bits in
  2317. ///    the destination.
  2318. ///
  2319. /// \headerfile <x86intrin.h>
  2320. ///
  2321. /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
  2322. ///
  2323. /// \param __a
  2324. ///    A 64-bit integer vector containing one of the source operands.
  2325. /// \param __b
  2326. ///    A 64-bit integer vector containing one of the source operands.
  2327. /// \returns A 64-bit integer vector containing the products of both operands.
  2328. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2329. _mm_mulhi_pu16(__m64 __a, __m64 __b)
  2330. {
  2331.   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
  2332. }
  2333.  
  2334. /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
  2335. ///    destination, as specified by the immediate value operand.
  2336. ///
  2337. /// \headerfile <x86intrin.h>
  2338. ///
  2339. /// \code
  2340. /// __m64 _mm_shuffle_pi16(__m64 a, const int n);
  2341. /// \endcode
  2342. ///
  2343. /// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
  2344. ///
  2345. /// \param a
  2346. ///    A 64-bit integer vector containing the values to be shuffled.
  2347. /// \param n
  2348. ///    An immediate value containing an 8-bit value specifying which elements to
  2349. ///    copy from \a a. The destinations within the 64-bit destination are
  2350. ///    assigned values as follows: \n
  2351. ///    Bits [1:0] are used to assign values to bits [15:0] in the
  2352. ///    destination. \n
  2353. ///    Bits [3:2] are used to assign values to bits [31:16] in the
  2354. ///    destination. \n
  2355. ///    Bits [5:4] are used to assign values to bits [47:32] in the
  2356. ///    destination. \n
  2357. ///    Bits [7:6] are used to assign values to bits [63:48] in the
  2358. ///    destination. \n
  2359. ///    Bit value assignments: \n
  2360. ///    00: assigned from bits [15:0] of \a a. \n
  2361. ///    01: assigned from bits [31:16] of \a a. \n
  2362. ///    10: assigned from bits [47:32] of \a a. \n
  2363. ///    11: assigned from bits [63:48] of \a a. \n
  2364. ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
  2365. ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
  2366. ///    <c>[b6, b4, b2, b0]</c>.
  2367. /// \returns A 64-bit integer vector containing the shuffled values.
  2368. #define _mm_shuffle_pi16(a, n) \
  2369.   ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
  2370.  
  2371. /// Conditionally copies the values from each 8-bit element in the first
  2372. ///    64-bit integer vector operand to the specified memory location, as
  2373. ///    specified by the most significant bit in the corresponding element in the
  2374. ///    second 64-bit integer vector operand.
  2375. ///
  2376. ///    To minimize caching, the data is flagged as non-temporal
  2377. ///    (unlikely to be used again soon).
  2378. ///
  2379. /// \headerfile <x86intrin.h>
  2380. ///
  2381. /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
  2382. ///
  2383. /// \param __d
  2384. ///    A 64-bit integer vector containing the values with elements to be copied.
  2385. /// \param __n
  2386. ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
  2387. ///    element determines whether the corresponding element in operand \a __d
  2388. ///    is copied. If the most significant bit of a given element is 1, the
  2389. ///    corresponding element in operand \a __d is copied.
  2390. /// \param __p
  2391. ///    A pointer to a 64-bit memory location that will receive the conditionally
  2392. ///    copied integer values. The address of the memory location does not have
  2393. ///    to be aligned.
  2394. static __inline__ void __DEFAULT_FN_ATTRS_MMX
  2395. _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
  2396. {
  2397.   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
  2398. }
  2399.  
  2400. /// Computes the rounded averages of the packed unsigned 8-bit integer
  2401. ///    values and writes the averages to the corresponding bits in the
  2402. ///    destination.
  2403. ///
  2404. /// \headerfile <x86intrin.h>
  2405. ///
  2406. /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
  2407. ///
  2408. /// \param __a
  2409. ///    A 64-bit integer vector containing one of the source operands.
  2410. /// \param __b
  2411. ///    A 64-bit integer vector containing one of the source operands.
  2412. /// \returns A 64-bit integer vector containing the averages of both operands.
  2413. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2414. _mm_avg_pu8(__m64 __a, __m64 __b)
  2415. {
  2416.   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
  2417. }
  2418.  
  2419. /// Computes the rounded averages of the packed unsigned 16-bit integer
  2420. ///    values and writes the averages to the corresponding bits in the
  2421. ///    destination.
  2422. ///
  2423. /// \headerfile <x86intrin.h>
  2424. ///
  2425. /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
  2426. ///
  2427. /// \param __a
  2428. ///    A 64-bit integer vector containing one of the source operands.
  2429. /// \param __b
  2430. ///    A 64-bit integer vector containing one of the source operands.
  2431. /// \returns A 64-bit integer vector containing the averages of both operands.
  2432. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2433. _mm_avg_pu16(__m64 __a, __m64 __b)
  2434. {
  2435.   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
  2436. }
  2437.  
  2438. /// Subtracts the corresponding 8-bit unsigned integer values of the two
  2439. ///    64-bit vector operands and computes the absolute value for each of the
  2440. ///    difference. Then sum of the 8 absolute differences is written to the
  2441. ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
  2442. ///
  2443. /// \headerfile <x86intrin.h>
  2444. ///
  2445. /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
  2446. ///
  2447. /// \param __a
  2448. ///    A 64-bit integer vector containing one of the source operands.
  2449. /// \param __b
  2450. ///    A 64-bit integer vector containing one of the source operands.
  2451. /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
  2452. ///    sets of absolute differences between both operands. The upper bits are
  2453. ///    cleared.
  2454. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2455. _mm_sad_pu8(__m64 __a, __m64 __b)
  2456. {
  2457.   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
  2458. }
  2459.  
  2460. #if defined(__cplusplus)
  2461. extern "C" {
  2462. #endif
  2463.  
  2464. /// Returns the contents of the MXCSR register as a 32-bit unsigned
  2465. ///    integer value.
  2466. ///
  2467. ///    There are several groups of macros associated with this
  2468. ///    intrinsic, including:
  2469. ///    <ul>
  2470. ///    <li>
  2471. ///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
  2472. ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
  2473. ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
  2474. ///      _MM_GET_EXCEPTION_STATE().
  2475. ///    </li>
  2476. ///    <li>
  2477. ///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
  2478. ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
  2479. ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
  2480. ///    </li>
  2481. ///    <li>
  2482. ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
  2483. ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
  2484. ///      _MM_GET_ROUNDING_MODE().
  2485. ///    </li>
  2486. ///    <li>
  2487. ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
  2488. ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
  2489. ///    </li>
  2490. ///    <li>
  2491. ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
  2492. ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
  2493. ///      _MM_GET_DENORMALS_ZERO_MODE().
  2494. ///    </li>
  2495. ///    </ul>
  2496. ///
  2497. ///    For example, the following expression checks if an overflow exception has
  2498. ///    occurred:
  2499. ///    \code
  2500. ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
  2501. ///    \endcode
  2502. ///
  2503. ///    The following expression gets the current rounding mode:
  2504. ///    \code
  2505. ///      _MM_GET_ROUNDING_MODE()
  2506. ///    \endcode
  2507. ///
  2508. /// \headerfile <x86intrin.h>
  2509. ///
  2510. /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
  2511. ///
  2512. /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
  2513. ///    register.
  2514. unsigned int _mm_getcsr(void);
  2515.  
  2516. /// Sets the MXCSR register with the 32-bit unsigned integer value.
  2517. ///
  2518. ///    There are several groups of macros associated with this intrinsic,
  2519. ///    including:
  2520. ///    <ul>
  2521. ///    <li>
  2522. ///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
  2523. ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
  2524. ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
  2525. ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
  2526. ///    </li>
  2527. ///    <li>
  2528. ///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
  2529. ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
  2530. ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
  2531. ///      of these macros.
  2532. ///    </li>
  2533. ///    <li>
  2534. ///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
  2535. ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
  2536. ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
  2537. ///    </li>
  2538. ///    <li>
  2539. ///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
  2540. ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
  2541. ///      one of these macros.
  2542. ///    </li>
  2543. ///    <li>
  2544. ///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
  2545. ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
  2546. ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
  2547. ///    </li>
  2548. ///    </ul>
  2549. ///
  2550. ///    For example, the following expression causes subsequent floating-point
  2551. ///    operations to round up:
  2552. ///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
  2553. ///
  2554. ///    The following example sets the DAZ and FTZ flags:
  2555. ///    \code
  2556. ///    void setFlags() {
  2557. ///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  2558. ///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  2559. ///    }
  2560. ///    \endcode
  2561. ///
  2562. /// \headerfile <x86intrin.h>
  2563. ///
  2564. /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
  2565. ///
  2566. /// \param __i
  2567. ///    A 32-bit unsigned integer value to be written to the MXCSR register.
  2568. void _mm_setcsr(unsigned int __i);
  2569.  
  2570. #if defined(__cplusplus)
  2571. } // extern "C"
  2572. #endif
  2573.  
  2574. /// Selects 4 float values from the 128-bit operands of [4 x float], as
  2575. ///    specified by the immediate value operand.
  2576. ///
  2577. /// \headerfile <x86intrin.h>
  2578. ///
  2579. /// \code
  2580. /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
  2581. /// \endcode
  2582. ///
  2583. /// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
  2584. ///
  2585. /// \param a
  2586. ///    A 128-bit vector of [4 x float].
  2587. /// \param b
  2588. ///    A 128-bit vector of [4 x float].
  2589. /// \param mask
  2590. ///    An immediate value containing an 8-bit value specifying which elements to
  2591. ///    copy from \a a and \a b. \n
  2592. ///    Bits [3:0] specify the values copied from operand \a a. \n
  2593. ///    Bits [7:4] specify the values copied from operand \a b. \n
  2594. ///    The destinations within the 128-bit destination are assigned values as
  2595. ///    follows: \n
  2596. ///    Bits [1:0] are used to assign values to bits [31:0] in the
  2597. ///    destination. \n
  2598. ///    Bits [3:2] are used to assign values to bits [63:32] in the
  2599. ///    destination. \n
  2600. ///    Bits [5:4] are used to assign values to bits [95:64] in the
  2601. ///    destination. \n
  2602. ///    Bits [7:6] are used to assign values to bits [127:96] in the
  2603. ///    destination. \n
  2604. ///    Bit value assignments: \n
  2605. ///    00: Bits [31:0] copied from the specified operand. \n
  2606. ///    01: Bits [63:32] copied from the specified operand. \n
  2607. ///    10: Bits [95:64] copied from the specified operand. \n
  2608. ///    11: Bits [127:96] copied from the specified operand. \n
  2609. ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
  2610. ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
  2611. ///    <c>[b6, b4, b2, b0]</c>.
  2612. /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
  2613. #define _mm_shuffle_ps(a, b, mask) \
  2614.   ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
  2615.                                  (int)(mask)))
  2616.  
  2617. /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
  2618. ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
  2619. ///
  2620. /// \headerfile <x86intrin.h>
  2621. ///
  2622. /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
  2623. ///
  2624. /// \param __a
  2625. ///    A 128-bit vector of [4 x float]. \n
  2626. ///    Bits [95:64] are written to bits [31:0] of the destination. \n
  2627. ///    Bits [127:96] are written to bits [95:64] of the destination.
  2628. /// \param __b
  2629. ///    A 128-bit vector of [4 x float].
  2630. ///    Bits [95:64] are written to bits [63:32] of the destination. \n
  2631. ///    Bits [127:96] are written to bits [127:96] of the destination.
  2632. /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
  2633. static __inline__ __m128 __DEFAULT_FN_ATTRS
  2634. _mm_unpackhi_ps(__m128 __a, __m128 __b)
  2635. {
  2636.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
  2637. }
  2638.  
  2639. /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
  2640. ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
  2641. ///
  2642. /// \headerfile <x86intrin.h>
  2643. ///
  2644. /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
  2645. ///
  2646. /// \param __a
  2647. ///    A 128-bit vector of [4 x float]. \n
  2648. ///    Bits [31:0] are written to bits [31:0] of the destination.  \n
  2649. ///    Bits [63:32] are written to bits [95:64] of the destination.
  2650. /// \param __b
  2651. ///    A 128-bit vector of [4 x float]. \n
  2652. ///    Bits [31:0] are written to bits [63:32] of the destination. \n
  2653. ///    Bits [63:32] are written to bits [127:96] of the destination.
  2654. /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
  2655. static __inline__ __m128 __DEFAULT_FN_ATTRS
  2656. _mm_unpacklo_ps(__m128 __a, __m128 __b)
  2657. {
  2658.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
  2659. }
  2660.  
  2661. /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
  2662. ///    32 bits are set to the lower 32 bits of the second parameter. The upper
  2663. ///    96 bits are set to the upper 96 bits of the first parameter.
  2664. ///
  2665. /// \headerfile <x86intrin.h>
  2666. ///
  2667. /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
  2668. ///    instruction.
  2669. ///
  2670. /// \param __a
  2671. ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
  2672. ///    written to the upper 96 bits of the result.
  2673. /// \param __b
  2674. ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
  2675. ///    written to the lower 32 bits of the result.
  2676. /// \returns A 128-bit floating-point vector of [4 x float].
  2677. static __inline__ __m128 __DEFAULT_FN_ATTRS
  2678. _mm_move_ss(__m128 __a, __m128 __b)
  2679. {
  2680.   __a[0] = __b[0];
  2681.   return __a;
  2682. }
  2683.  
  2684. /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
  2685. ///    64 bits are set to the upper 64 bits of the second parameter. The upper
  2686. ///    64 bits are set to the upper 64 bits of the first parameter.
  2687. ///
  2688. /// \headerfile <x86intrin.h>
  2689. ///
  2690. /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
  2691. ///
  2692. /// \param __a
  2693. ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
  2694. ///    written to the upper 64 bits of the result.
  2695. /// \param __b
  2696. ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
  2697. ///    written to the lower 64 bits of the result.
  2698. /// \returns A 128-bit floating-point vector of [4 x float].
  2699. static __inline__ __m128 __DEFAULT_FN_ATTRS
  2700. _mm_movehl_ps(__m128 __a, __m128 __b)
  2701. {
  2702.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
  2703. }
  2704.  
  2705. /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
  2706. ///    64 bits are set to the lower 64 bits of the first parameter. The upper
  2707. ///    64 bits are set to the lower 64 bits of the second parameter.
  2708. ///
  2709. /// \headerfile <x86intrin.h>
  2710. ///
  2711. /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
  2712. ///
  2713. /// \param __a
  2714. ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
  2715. ///    written to the lower 64 bits of the result.
  2716. /// \param __b
  2717. ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
  2718. ///    written to the upper 64 bits of the result.
  2719. /// \returns A 128-bit floating-point vector of [4 x float].
  2720. static __inline__ __m128 __DEFAULT_FN_ATTRS
  2721. _mm_movelh_ps(__m128 __a, __m128 __b)
  2722. {
  2723.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
  2724. }
  2725.  
  2726. /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
  2727. ///    float].
  2728. ///
  2729. /// \headerfile <x86intrin.h>
  2730. ///
  2731. /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
  2732. ///
  2733. /// \param __a
  2734. ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
  2735. ///    from the corresponding elements in this operand.
  2736. /// \returns A 128-bit vector of [4 x float] containing the copied and converted
  2737. ///    values from the operand.
  2738. static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
  2739. _mm_cvtpi16_ps(__m64 __a)
  2740. {
  2741.   __m64 __b, __c;
  2742.   __m128 __r;
  2743.  
  2744.   __b = _mm_setzero_si64();
  2745.   __b = _mm_cmpgt_pi16(__b, __a);
  2746.   __c = _mm_unpackhi_pi16(__a, __b);
  2747.   __r = _mm_setzero_ps();
  2748.   __r = _mm_cvtpi32_ps(__r, __c);
  2749.   __r = _mm_movelh_ps(__r, __r);
  2750.   __c = _mm_unpacklo_pi16(__a, __b);
  2751.   __r = _mm_cvtpi32_ps(__r, __c);
  2752.  
  2753.   return __r;
  2754. }
  2755.  
  2756. /// Converts a 64-bit vector of 16-bit unsigned integer values into a
  2757. ///    128-bit vector of [4 x float].
  2758. ///
  2759. /// \headerfile <x86intrin.h>
  2760. ///
  2761. /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
  2762. ///
  2763. /// \param __a
  2764. ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
  2765. ///    destination are copied from the corresponding elements in this operand.
  2766. /// \returns A 128-bit vector of [4 x float] containing the copied and converted
  2767. ///    values from the operand.
  2768. static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
  2769. _mm_cvtpu16_ps(__m64 __a)
  2770. {
  2771.   __m64 __b, __c;
  2772.   __m128 __r;
  2773.  
  2774.   __b = _mm_setzero_si64();
  2775.   __c = _mm_unpackhi_pi16(__a, __b);
  2776.   __r = _mm_setzero_ps();
  2777.   __r = _mm_cvtpi32_ps(__r, __c);
  2778.   __r = _mm_movelh_ps(__r, __r);
  2779.   __c = _mm_unpacklo_pi16(__a, __b);
  2780.   __r = _mm_cvtpi32_ps(__r, __c);
  2781.  
  2782.   return __r;
  2783. }
  2784.  
  2785. /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
  2786. ///    into a 128-bit vector of [4 x float].
  2787. ///
  2788. /// \headerfile <x86intrin.h>
  2789. ///
  2790. /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
  2791. ///
  2792. /// \param __a
  2793. ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
  2794. ///    from the corresponding lower 4 elements in this operand.
  2795. /// \returns A 128-bit vector of [4 x float] containing the copied and converted
  2796. ///    values from the operand.
  2797. static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
  2798. _mm_cvtpi8_ps(__m64 __a)
  2799. {
  2800.   __m64 __b;
  2801.  
  2802.   __b = _mm_setzero_si64();
  2803.   __b = _mm_cmpgt_pi8(__b, __a);
  2804.   __b = _mm_unpacklo_pi8(__a, __b);
  2805.  
  2806.   return _mm_cvtpi16_ps(__b);
  2807. }
  2808.  
  2809. /// Converts the lower four unsigned 8-bit integer values from a 64-bit
  2810. ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
  2811. ///
  2812. /// \headerfile <x86intrin.h>
  2813. ///
  2814. /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
  2815. ///
  2816. /// \param __a
  2817. ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
  2818. ///    destination are copied from the corresponding lower 4 elements in this
  2819. ///    operand.
  2820. /// \returns A 128-bit vector of [4 x float] containing the copied and converted
  2821. ///    values from the source operand.
  2822. static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
  2823. _mm_cvtpu8_ps(__m64 __a)
  2824. {
  2825.   __m64 __b;
  2826.  
  2827.   __b = _mm_setzero_si64();
  2828.   __b = _mm_unpacklo_pi8(__a, __b);
  2829.  
  2830.   return _mm_cvtpi16_ps(__b);
  2831. }
  2832.  
  2833. /// Converts the two 32-bit signed integer values from each 64-bit vector
  2834. ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
  2835. ///
  2836. /// \headerfile <x86intrin.h>
  2837. ///
  2838. /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
  2839. ///
  2840. /// \param __a
  2841. ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
  2842. ///    copied from the elements in this operand.
  2843. /// \param __b
  2844. ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
  2845. ///    copied from the elements in this operand.
  2846. /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
  2847. ///    copied and converted values from the first operand. The upper 64 bits
  2848. ///    contain the copied and converted values from the second operand.
  2849. static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
  2850. _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
  2851. {
  2852.   __m128 __c;
  2853.  
  2854.   __c = _mm_setzero_ps();
  2855.   __c = _mm_cvtpi32_ps(__c, __b);
  2856.   __c = _mm_movelh_ps(__c, __c);
  2857.  
  2858.   return _mm_cvtpi32_ps(__c, __a);
  2859. }
  2860.  
  2861. /// Converts each single-precision floating-point element of a 128-bit
  2862. ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
  2863. ///    packs the results into a 64-bit integer vector of [4 x i16].
  2864. ///
  2865. ///    If the floating-point element is NaN or infinity, or if the
  2866. ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
  2867. ///    it is converted to 0x8000. Otherwise if the floating-point element is
  2868. ///    greater than 0x7FFF, it is converted to 0x7FFF.
  2869. ///
  2870. /// \headerfile <x86intrin.h>
  2871. ///
  2872. /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
  2873. ///
  2874. /// \param __a
  2875. ///    A 128-bit floating-point vector of [4 x float].
  2876. /// \returns A 64-bit integer vector of [4 x i16] containing the converted
  2877. ///    values.
  2878. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2879. _mm_cvtps_pi16(__m128 __a)
  2880. {
  2881.   __m64 __b, __c;
  2882.  
  2883.   __b = _mm_cvtps_pi32(__a);
  2884.   __a = _mm_movehl_ps(__a, __a);
  2885.   __c = _mm_cvtps_pi32(__a);
  2886.  
  2887.   return _mm_packs_pi32(__b, __c);
  2888. }
  2889.  
  2890. /// Converts each single-precision floating-point element of a 128-bit
  2891. ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
  2892. ///    packs the results into the lower 32 bits of a 64-bit integer vector of
  2893. ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
  2894. ///
  2895. ///    If the floating-point element is NaN or infinity, or if the
  2896. ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
  2897. ///    is converted to 0x80. Otherwise if the floating-point element is greater
  2898. ///    than 0x7F, it is converted to 0x7F.
  2899. ///
  2900. /// \headerfile <x86intrin.h>
  2901. ///
  2902. /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
  2903. ///
  2904. /// \param __a
  2905. ///    128-bit floating-point vector of [4 x float].
  2906. /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
  2907. ///    converted values and the uppper 32 bits are set to zero.
  2908. static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  2909. _mm_cvtps_pi8(__m128 __a)
  2910. {
  2911.   __m64 __b, __c;
  2912.  
  2913.   __b = _mm_cvtps_pi16(__a);
  2914.   __c = _mm_setzero_si64();
  2915.  
  2916.   return _mm_packs_pi16(__b, __c);
  2917. }
  2918.  
  2919. /// Extracts the sign bits from each single-precision floating-point
  2920. ///    element of a 128-bit floating-point vector of [4 x float] and returns the
  2921. ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
  2922. ///    to zero.
  2923. ///
  2924. /// \headerfile <x86intrin.h>
  2925. ///
  2926. /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
  2927. ///
  2928. /// \param __a
  2929. ///    A 128-bit floating-point vector of [4 x float].
  2930. /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
  2931. ///    single-precision floating-point element of the parameter. Bits [31:4] are
  2932. ///    set to zero.
  2933. static __inline__ int __DEFAULT_FN_ATTRS
  2934. _mm_movemask_ps(__m128 __a)
  2935. {
  2936.   return __builtin_ia32_movmskps((__v4sf)__a);
  2937. }
  2938.  
  2939.  
  2940. #define _MM_ALIGN16 __attribute__((aligned(16)))
  2941.  
  2942. #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
  2943.  
  2944. #define _MM_EXCEPT_INVALID    (0x0001U)
  2945. #define _MM_EXCEPT_DENORM     (0x0002U)
  2946. #define _MM_EXCEPT_DIV_ZERO   (0x0004U)
  2947. #define _MM_EXCEPT_OVERFLOW   (0x0008U)
  2948. #define _MM_EXCEPT_UNDERFLOW  (0x0010U)
  2949. #define _MM_EXCEPT_INEXACT    (0x0020U)
  2950. #define _MM_EXCEPT_MASK       (0x003fU)
  2951.  
  2952. #define _MM_MASK_INVALID      (0x0080U)
  2953. #define _MM_MASK_DENORM       (0x0100U)
  2954. #define _MM_MASK_DIV_ZERO     (0x0200U)
  2955. #define _MM_MASK_OVERFLOW     (0x0400U)
  2956. #define _MM_MASK_UNDERFLOW    (0x0800U)
  2957. #define _MM_MASK_INEXACT      (0x1000U)
  2958. #define _MM_MASK_MASK         (0x1f80U)
  2959.  
  2960. #define _MM_ROUND_NEAREST     (0x0000U)
  2961. #define _MM_ROUND_DOWN        (0x2000U)
  2962. #define _MM_ROUND_UP          (0x4000U)
  2963. #define _MM_ROUND_TOWARD_ZERO (0x6000U)
  2964. #define _MM_ROUND_MASK        (0x6000U)
  2965.  
  2966. #define _MM_FLUSH_ZERO_MASK   (0x8000U)
  2967. #define _MM_FLUSH_ZERO_ON     (0x8000U)
  2968. #define _MM_FLUSH_ZERO_OFF    (0x0000U)
  2969.  
  2970. #define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
  2971. #define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
  2972. #define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
  2973. #define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
  2974.  
  2975. #define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
  2976. #define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
  2977. #define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
  2978. #define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
  2979.  
  2980. #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
  2981. do { \
  2982.   __m128 tmp3, tmp2, tmp1, tmp0; \
  2983.   tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  2984.   tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  2985.   tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  2986.   tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  2987.   (row0) = _mm_movelh_ps(tmp0, tmp2); \
  2988.   (row1) = _mm_movehl_ps(tmp2, tmp0); \
  2989.   (row2) = _mm_movelh_ps(tmp1, tmp3); \
  2990.   (row3) = _mm_movehl_ps(tmp3, tmp1); \
  2991. } while (0)
  2992.  
  2993. /* Aliases for compatibility. */
  2994. #define _m_pextrw _mm_extract_pi16
  2995. #define _m_pinsrw _mm_insert_pi16
  2996. #define _m_pmaxsw _mm_max_pi16
  2997. #define _m_pmaxub _mm_max_pu8
  2998. #define _m_pminsw _mm_min_pi16
  2999. #define _m_pminub _mm_min_pu8
  3000. #define _m_pmovmskb _mm_movemask_pi8
  3001. #define _m_pmulhuw _mm_mulhi_pu16
  3002. #define _m_pshufw _mm_shuffle_pi16
  3003. #define _m_maskmovq _mm_maskmove_si64
  3004. #define _m_pavgb _mm_avg_pu8
  3005. #define _m_pavgw _mm_avg_pu16
  3006. #define _m_psadbw _mm_sad_pu8
  3007. #define _m_ _mm_
  3008.  
  3009. #undef __DEFAULT_FN_ATTRS
  3010. #undef __DEFAULT_FN_ATTRS_MMX
  3011.  
  3012. /* Ugly hack for backwards-compatibility (compatible with gcc) */
  3013. #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
  3014. #include <emmintrin.h>
  3015. #endif
  3016.  
  3017. #endif /* __XMMINTRIN_H */
  3018.