Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===---- avxintrin.h - AVX intrinsics -------------------------------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __IMMINTRIN_H
  11. #error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
  12. #endif
  13.  
  14. #ifndef __AVXINTRIN_H
  15. #define __AVXINTRIN_H
  16.  
  17. typedef double __v4df __attribute__ ((__vector_size__ (32))); /* [4 x double] */
  18. typedef float __v8sf __attribute__ ((__vector_size__ (32))); /* [8 x float] */
  19. typedef long long __v4di __attribute__ ((__vector_size__ (32))); /* [4 x long long] */
  20. typedef int __v8si __attribute__ ((__vector_size__ (32))); /* [8 x int] */
  21. typedef short __v16hi __attribute__ ((__vector_size__ (32))); /* [16 x short] */
  22. typedef char __v32qi __attribute__ ((__vector_size__ (32))); /* [32 x char] */
  23.  
  24. /* Unsigned element types (same 32-byte vector size as the types above) */
  25. typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
  26. typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
  27. typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
  28. typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
  29.  
  30. /* Plain char has implementation-defined signedness, so an explicitly signed
  31.  * variant is needed. It should not appear in the public interface, though. */
  32. typedef signed char __v32qs __attribute__((__vector_size__(32)));
  33.  
  34. typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32))); /* 32-byte aligned */
  35. typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32))); /* 32-byte aligned */
  36. typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32))); /* 32-byte aligned */
  37.  
  38. typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1))); /* alignment 1 (unaligned) */
  39. typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1))); /* alignment 1 (unaligned) */
  40. typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1))); /* alignment 1 (unaligned) */
  41.  
  42. #ifdef __SSE2__
  43. /* _Float16 and __bf16 both require SSE2 to be enabled. */
  44. typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
  45. typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
  46. typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
  47.  
  48. typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
  49. typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
  50. #endif
  51.  
  52. /* Default attributes for the functions in this file; the two variants differ only in __min_vector_width__ (256 vs. 128). */
  53. #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
  54. #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
  55.  
  56. /* Arithmetic */
  57. /// Computes the element-wise sum of two 256-bit vectors of [4 x double].
  58. ///
  59. /// \headerfile <x86intrin.h>
  60. ///
  61. /// This intrinsic corresponds to the <c> VADDPD </c> instruction.
  62. ///
  63. /// \param __a
  64. ///    A 256-bit vector of [4 x double] holding the first addend.
  65. /// \param __b
  66. ///    A 256-bit vector of [4 x double] holding the second addend.
  67. /// \returns A 256-bit vector of [4 x double] in which each element is the sum
  68. ///    of the corresponding elements of \a __a and \a __b.
  69. static __inline __m256d __DEFAULT_FN_ATTRS
  70. _mm256_add_pd(__m256d __a, __m256d __b) {
  71.   __v4df __sum = (__v4df)__a + (__v4df)__b;
  72.   return (__m256d)__sum;
  73. }
  74.  
  75. /// Computes the element-wise sum of two 256-bit vectors of [8 x float].
  76. ///
  77. /// \headerfile <x86intrin.h>
  78. ///
  79. /// This intrinsic corresponds to the <c> VADDPS </c> instruction.
  80. ///
  81. /// \param __a
  82. ///    A 256-bit vector of [8 x float] holding the first addend.
  83. /// \param __b
  84. ///    A 256-bit vector of [8 x float] holding the second addend.
  85. /// \returns A 256-bit vector of [8 x float] in which each element is the sum
  86. ///    of the corresponding elements of \a __a and \a __b.
  87. static __inline __m256 __DEFAULT_FN_ATTRS
  88. _mm256_add_ps(__m256 __a, __m256 __b) {
  89.   __v8sf __sum = (__v8sf)__a + (__v8sf)__b;
  90.   return (__m256)__sum;
  91. }
  92.  
  93. /// Subtracts one 256-bit vector of [4 x double] from another, element-wise.
  94. ///
  95. /// \headerfile <x86intrin.h>
  96. ///
  97. /// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
  98. ///
  99. /// \param __a
  100. ///    A 256-bit vector of [4 x double] containing the minuend.
  101. /// \param __b
  102. ///    A 256-bit vector of [4 x double] containing the subtrahend.
  103. /// \returns A 256-bit vector of [4 x double] in which each element is the
  104. ///    corresponding element of \a __a minus that of \a __b.
  105. static __inline __m256d __DEFAULT_FN_ATTRS
  106. _mm256_sub_pd(__m256d __a, __m256d __b) {
  107.   __v4df __diff = (__v4df)__a - (__v4df)__b;
  108.   return (__m256d)__diff;
  109. }
  110.  
  111. /// Subtracts one 256-bit vector of [8 x float] from another, element-wise.
  112. ///
  113. /// \headerfile <x86intrin.h>
  114. ///
  115. /// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
  116. ///
  117. /// \param __a
  118. ///    A 256-bit vector of [8 x float] containing the minuend.
  119. /// \param __b
  120. ///    A 256-bit vector of [8 x float] containing the subtrahend.
  121. /// \returns A 256-bit vector of [8 x float] in which each element is the
  122. ///    corresponding element of \a __a minus that of \a __b.
  123. static __inline __m256 __DEFAULT_FN_ATTRS
  124. _mm256_sub_ps(__m256 __a, __m256 __b) {
  125.   __v8sf __diff = (__v8sf)__a - (__v8sf)__b;
  126.   return (__m256)__diff;
  127. }
  128.  
  129. /// Subtracts the even-indexed values and adds the odd-indexed values of
  130. ///    two 256-bit vectors of [4 x double].
  131. ///
  132. /// \headerfile <x86intrin.h>
  133. ///
  134. /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
  135. ///
  136. /// \param __a
  137. ///    A 256-bit vector of [4 x double] containing the left source operand.
  138. /// \param __b
  139. ///    A 256-bit vector of [4 x double] containing the right source operand.
  140. /// \returns A 256-bit vector of [4 x double] containing the alternating
  141. ///    differences (even indices) and sums (odd indices) of both operands.
  142. static __inline __m256d __DEFAULT_FN_ATTRS
  143. _mm256_addsub_pd(__m256d __a, __m256d __b)
  144. {
  145.   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
  146. }
  147.  
  148. /// Subtracts the even-indexed values and adds the odd-indexed values of
  149. ///    two 256-bit vectors of [8 x float].
  150. ///
  151. /// \headerfile <x86intrin.h>
  152. ///
  153. /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
  154. ///
  155. /// \param __a
  156. ///    A 256-bit vector of [8 x float] containing the left source operand.
  157. /// \param __b
  158. ///    A 256-bit vector of [8 x float] containing the right source operand.
  159. /// \returns A 256-bit vector of [8 x float] containing the alternating
  160. ///    differences (even indices) and sums (odd indices) of both operands.
  161. static __inline __m256 __DEFAULT_FN_ATTRS
  162. _mm256_addsub_ps(__m256 __a, __m256 __b)
  163. {
  164.   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
  165. }
  166.  
  167. /// Divides one 256-bit vector of [4 x double] by another, element-wise.
  168. ///
  169. /// \headerfile <x86intrin.h>
  170. ///
  171. /// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
  172. ///
  173. /// \param __a
  174. ///    A 256-bit vector of [4 x double] containing the dividend.
  175. /// \param __b
  176. ///    A 256-bit vector of [4 x double] containing the divisor.
  177. /// \returns A 256-bit vector of [4 x double] in which each element is the
  178. ///    corresponding element of \a __a divided by that of \a __b.
  179. static __inline __m256d __DEFAULT_FN_ATTRS
  180. _mm256_div_pd(__m256d __a, __m256d __b) {
  181.   __v4df __quot = (__v4df)__a / (__v4df)__b;
  182.   return (__m256d)__quot;
  183. }
  184.  
  185. /// Divides one 256-bit vector of [8 x float] by another, element-wise.
  186. ///
  187. /// \headerfile <x86intrin.h>
  188. ///
  189. /// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
  190. ///
  191. /// \param __a
  192. ///    A 256-bit vector of [8 x float] containing the dividend.
  193. /// \param __b
  194. ///    A 256-bit vector of [8 x float] containing the divisor.
  195. /// \returns A 256-bit vector of [8 x float] in which each element is the
  196. ///    corresponding element of \a __a divided by that of \a __b.
  197. static __inline __m256 __DEFAULT_FN_ATTRS
  198. _mm256_div_ps(__m256 __a, __m256 __b) {
  199.   __v8sf __quot = (__v8sf)__a / (__v8sf)__b;
  200.   return (__m256)__quot;
  201. }
  202.  
  203. /// Selects the greater value from each pair of corresponding values in two
  204. ///    256-bit vectors of [4 x double].
  205. ///
  206. /// \headerfile <x86intrin.h>
  207. ///
  208. /// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
  209. ///
  210. /// \param __a
  211. ///    A 256-bit vector of [4 x double] containing one of the operands.
  212. /// \param __b
  213. ///    A 256-bit vector of [4 x double] containing one of the operands.
  214. /// \returns A 256-bit vector of [4 x double] containing the maximum values
  215. ///    between both operands.
  216. static __inline __m256d __DEFAULT_FN_ATTRS
  217. _mm256_max_pd(__m256d __a, __m256d __b) {
  218.   __v4df __lhs = (__v4df)__a, __rhs = (__v4df)__b;
  219.   return (__m256d)__builtin_ia32_maxpd256(__lhs, __rhs);
  220. }
  221.  
  222. /// Selects the greater value from each pair of corresponding values in two
  223. ///    256-bit vectors of [8 x float].
  224. ///
  225. /// \headerfile <x86intrin.h>
  226. ///
  227. /// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
  228. ///
  229. /// \param __a
  230. ///    A 256-bit vector of [8 x float] containing one of the operands.
  231. /// \param __b
  232. ///    A 256-bit vector of [8 x float] containing one of the operands.
  233. /// \returns A 256-bit vector of [8 x float] containing the maximum values
  234. ///    between both operands.
  235. static __inline __m256 __DEFAULT_FN_ATTRS
  236. _mm256_max_ps(__m256 __a, __m256 __b) {
  237.   __v8sf __lhs = (__v8sf)__a, __rhs = (__v8sf)__b;
  238.   return (__m256)__builtin_ia32_maxps256(__lhs, __rhs);
  239. }
  240.  
  241. /// Selects the lesser value from each pair of corresponding values in two
  242. ///    256-bit vectors of [4 x double].
  243. ///
  244. /// \headerfile <x86intrin.h>
  245. ///
  246. /// This intrinsic corresponds to the <c> VMINPD </c> instruction.
  247. ///
  248. /// \param __a
  249. ///    A 256-bit vector of [4 x double] containing one of the operands.
  250. /// \param __b
  251. ///    A 256-bit vector of [4 x double] containing one of the operands.
  252. /// \returns A 256-bit vector of [4 x double] containing the minimum values
  253. ///    between both operands.
  254. static __inline __m256d __DEFAULT_FN_ATTRS
  255. _mm256_min_pd(__m256d __a, __m256d __b) {
  256.   __v4df __lhs = (__v4df)__a, __rhs = (__v4df)__b;
  257.   return (__m256d)__builtin_ia32_minpd256(__lhs, __rhs);
  258. }
  259.  
  260. /// Selects the lesser value from each pair of corresponding values in two
  261. ///    256-bit vectors of [8 x float].
  262. ///
  263. /// \headerfile <x86intrin.h>
  264. ///
  265. /// This intrinsic corresponds to the <c> VMINPS </c> instruction.
  266. ///
  267. /// \param __a
  268. ///    A 256-bit vector of [8 x float] containing one of the operands.
  269. /// \param __b
  270. ///    A 256-bit vector of [8 x float] containing one of the operands.
  271. /// \returns A 256-bit vector of [8 x float] containing the minimum values
  272. ///    between both operands.
  273. static __inline __m256 __DEFAULT_FN_ATTRS
  274. _mm256_min_ps(__m256 __a, __m256 __b) {
  275.   __v8sf __lhs = (__v8sf)__a, __rhs = (__v8sf)__b;
  276.   return (__m256)__builtin_ia32_minps256(__lhs, __rhs);
  277. }
  278.  
  279. /// Computes the element-wise product of two 256-bit vectors of [4 x double].
  280. ///
  281. /// \headerfile <x86intrin.h>
  282. ///
  283. /// This intrinsic corresponds to the <c> VMULPD </c> instruction.
  284. ///
  285. /// \param __a
  286. ///    A 256-bit vector of [4 x double] holding the first factor.
  287. /// \param __b
  288. ///    A 256-bit vector of [4 x double] holding the second factor.
  289. /// \returns A 256-bit vector of [4 x double] in which each element is the
  290. ///    product of the corresponding elements of \a __a and \a __b.
  291. static __inline __m256d __DEFAULT_FN_ATTRS
  292. _mm256_mul_pd(__m256d __a, __m256d __b) {
  293.   __v4df __prod = (__v4df)__a * (__v4df)__b;
  294.   return (__m256d)__prod;
  295. }
  296.  
  297. /// Computes the element-wise product of two 256-bit vectors of [8 x float].
  298. ///
  299. /// \headerfile <x86intrin.h>
  300. ///
  301. /// This intrinsic corresponds to the <c> VMULPS </c> instruction.
  302. ///
  303. /// \param __a
  304. ///    A 256-bit vector of [8 x float] holding the first factor.
  305. /// \param __b
  306. ///    A 256-bit vector of [8 x float] holding the second factor.
  307. /// \returns A 256-bit vector of [8 x float] in which each element is the
  308. ///    product of the corresponding elements of \a __a and \a __b.
  309. static __inline __m256 __DEFAULT_FN_ATTRS
  310. _mm256_mul_ps(__m256 __a, __m256 __b) {
  311.   __v8sf __prod = (__v8sf)__a * (__v8sf)__b;
  312.   return (__m256)__prod;
  313. }
  314.  
  315. /// Computes the square root of each value in a 256-bit vector of
  316. ///    [4 x double].
  317. ///
  318. /// \headerfile <x86intrin.h>
  319. ///
  320. /// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
  321. ///
  322. /// \param __a
  323. ///    A 256-bit vector of [4 x double].
  324. /// \returns A 256-bit vector of [4 x double] containing the square roots of the
  325. ///    values in the operand.
  326. static __inline __m256d __DEFAULT_FN_ATTRS
  327. _mm256_sqrt_pd(__m256d __a) {
  328.   __v4df __src = (__v4df)__a;
  329.   return (__m256d)__builtin_ia32_sqrtpd256(__src);
  330. }
  331.  
  332. /// Computes the square root of each value in a 256-bit vector of
  333. ///    [8 x float].
  334. ///
  335. /// \headerfile <x86intrin.h>
  336. ///
  337. /// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
  338. ///
  339. /// \param __a
  340. ///    A 256-bit vector of [8 x float].
  341. /// \returns A 256-bit vector of [8 x float] containing the square roots of the
  342. ///    values in the operand.
  343. static __inline __m256 __DEFAULT_FN_ATTRS
  344. _mm256_sqrt_ps(__m256 __a) {
  345.   __v8sf __src = (__v8sf)__a;
  346.   return (__m256)__builtin_ia32_sqrtps256(__src);
  347. }
  348.  
  349. /// Computes the reciprocal square root of each value in a 256-bit
  350. ///    vector of [8 x float].
  351. ///
  352. /// \headerfile <x86intrin.h>
  353. ///
  354. /// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
  355. ///
  356. /// \param __a
  357. ///    A 256-bit vector of [8 x float].
  358. /// \returns A 256-bit vector of [8 x float] containing the reciprocal square
  359. ///    roots of the values in the operand.
  360. static __inline __m256 __DEFAULT_FN_ATTRS
  361. _mm256_rsqrt_ps(__m256 __a) {
  362.   __v8sf __src = (__v8sf)__a;
  363.   return (__m256)__builtin_ia32_rsqrtps256(__src);
  364. }
  365.  
  366. /// Computes the reciprocal of each value in a 256-bit vector of
  367. ///    [8 x float].
  368. ///
  369. /// \headerfile <x86intrin.h>
  370. ///
  371. /// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
  372. ///
  373. /// \param __a
  374. ///    A 256-bit vector of [8 x float].
  375. /// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
  376. ///    values in the operand.
  377. static __inline __m256 __DEFAULT_FN_ATTRS
  378. _mm256_rcp_ps(__m256 __a) {
  379.   __v8sf __src = (__v8sf)__a;
  380.   return (__m256)__builtin_ia32_rcpps256(__src);
  381. }
  382.  
  383. /// Rounds each value in a 256-bit vector of [4 x double] to an integer value
  384. ///    using the rounding operation selected by \a M. The results are returned
  385. ///    as 64-bit double-precision floating-point values.
  386. ///
  387. /// \headerfile <x86intrin.h>
  388. ///
  389. /// \code
  390. /// __m256d _mm256_round_pd(__m256d V, const int M);
  391. /// \endcode
  392. ///
  393. /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
  394. ///
  395. /// \param V
  396. ///    A 256-bit vector of [4 x double].
  397. /// \param M
  398. ///    An integer value that specifies the rounding operation. \n
  399. ///    Bits [7:4] are reserved. \n
  400. ///    Bit [3] is a precision exception value: \n
  401. ///      0: A normal PE exception is used. \n
  402. ///      1: The PE field is not updated. \n
  403. ///    Bit [2] is the rounding control source: \n
  404. ///      0: Use bits [1:0] of \a M. \n
  405. ///      1: Use the current MXCSR setting. \n
  406. ///    Bits [1:0] contain the rounding control definition: \n
  407. ///      00: Nearest. \n
  408. ///      01: Downward (toward negative infinity). \n
  409. ///      10: Upward (toward positive infinity). \n
  410. ///      11: Truncated.
  411. /// \returns A 256-bit vector of [4 x double] containing the rounded values.
  412. #define _mm256_round_pd(V, M) \
  413.   ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
  414.  
  415. /// Rounds each value in a 256-bit vector of [8 x float] to an integer value
  416. ///    using the rounding operation selected by \a M. The results are returned
  417. ///    as floating-point values.
  418. ///
  419. /// \headerfile <x86intrin.h>
  420. ///
  421. /// \code
  422. /// __m256 _mm256_round_ps(__m256 V, const int M);
  423. /// \endcode
  424. ///
  425. /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
  426. ///
  427. /// \param V
  428. ///    A 256-bit vector of [8 x float].
  429. /// \param M
  430. ///    An integer value that specifies the rounding operation. \n
  431. ///    Bits [7:4] are reserved. \n
  432. ///    Bit [3] is a precision exception value: \n
  433. ///      0: A normal PE exception is used. \n
  434. ///      1: The PE field is not updated. \n
  435. ///    Bit [2] is the rounding control source: \n
  436. ///      0: Use bits [1:0] of \a M. \n
  437. ///      1: Use the current MXCSR setting. \n
  438. ///    Bits [1:0] contain the rounding control definition: \n
  439. ///      00: Nearest. \n
  440. ///      01: Downward (toward negative infinity). \n
  441. ///      10: Upward (toward positive infinity). \n
  442. ///      11: Truncated.
  443. /// \returns A 256-bit vector of [8 x float] containing the rounded values.
  444. #define _mm256_round_ps(V, M) \
  445.   ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
  446.  
  447. /// Rounds each value in a 256-bit vector of [4 x double] up to an integer
  448. ///    value, returned as a 64-bit double-precision floating-point value.
  449. ///    Expands to _mm256_round_pd() with \c _MM_FROUND_CEIL as the mode.
  450. ///
  451. /// \headerfile <x86intrin.h>
  452. ///
  453. /// \code
  454. /// __m256d _mm256_ceil_pd(__m256d V);
  455. /// \endcode
  456. ///
  457. /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
  458. ///
  459. /// \param V
  460. ///    A 256-bit vector of [4 x double].
  461. /// \returns A 256-bit vector of [4 x double] containing the rounded up values.
  462. #define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
  463.  
  464. /// Rounds each value in a 256-bit vector of [4 x double] down to an integer
  465. ///    value, returned as a 64-bit double-precision floating-point value.
  466. ///    Expands to _mm256_round_pd() with \c _MM_FROUND_FLOOR as the mode.
  467. ///
  468. /// \headerfile <x86intrin.h>
  469. ///
  470. /// \code
  471. /// __m256d _mm256_floor_pd(__m256d V);
  472. /// \endcode
  473. ///
  474. /// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
  475. ///
  476. /// \param V
  477. ///    A 256-bit vector of [4 x double].
  478. /// \returns A 256-bit vector of [4 x double] containing the rounded down
  479. ///    values.
  480. #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
  481.  
  482. /// Rounds each value in a 256-bit vector of [8 x float] up to an integer
  483. ///    value, returned as a floating-point value. Expands to
  484. ///    _mm256_round_ps() with \c _MM_FROUND_CEIL as the mode.
  485. ///
  486. /// \headerfile <x86intrin.h>
  487. ///
  488. /// \code
  489. /// __m256 _mm256_ceil_ps(__m256 V);
  490. /// \endcode
  491. ///
  492. /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
  493. ///
  494. /// \param V
  495. ///    A 256-bit vector of [8 x float].
  496. /// \returns A 256-bit vector of [8 x float] containing the rounded up values.
  497. #define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
  498.  
  499. /// Rounds each value in a 256-bit vector of [8 x float] down to an integer
  500. ///    value, returned as a floating-point value. Expands to
  501. ///    _mm256_round_ps() with \c _MM_FROUND_FLOOR as the mode.
  502. ///
  503. /// \headerfile <x86intrin.h>
  504. ///
  505. /// \code
  506. /// __m256 _mm256_floor_ps(__m256 V);
  507. /// \endcode
  508. ///
  509. /// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
  510. ///
  511. /// \param V
  512. ///    A 256-bit vector of [8 x float].
  513. /// \returns A 256-bit vector of [8 x float] containing the rounded down values.
  514. #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
  515.  
  516. /* Logical */
  517. /// Computes the bitwise AND of two 256-bit vectors of [4 x double].
  518. ///
  519. /// \headerfile <x86intrin.h>
  520. ///
  521. /// This intrinsic corresponds to the <c> VANDPD </c> instruction.
  522. ///
  523. /// \param __a
  524. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  525. /// \param __b
  526. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  527. /// \returns A 256-bit vector of [4 x double] holding the bitwise AND of the
  528. ///    bit patterns of both operands.
  529. static __inline __m256d __DEFAULT_FN_ATTRS
  530. _mm256_and_pd(__m256d __a, __m256d __b) {
  531.   __v4du __bits = (__v4du)__a & (__v4du)__b;
  532.   return (__m256d)__bits;
  533. }
  534.  
  535. /// Computes the bitwise AND of two 256-bit vectors of [8 x float].
  536. ///
  537. /// \headerfile <x86intrin.h>
  538. ///
  539. /// This intrinsic corresponds to the <c> VANDPS </c> instruction.
  540. ///
  541. /// \param __a
  542. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  543. /// \param __b
  544. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  545. /// \returns A 256-bit vector of [8 x float] holding the bitwise AND of the
  546. ///    bit patterns of both operands.
  547. static __inline __m256 __DEFAULT_FN_ATTRS
  548. _mm256_and_ps(__m256 __a, __m256 __b) {
  549.   __v8su __bits = (__v8su)__a & (__v8su)__b;
  550.   return (__m256)__bits;
  551. }
  552.  
  553. /// Computes the bitwise AND of the one's complement of the first 256-bit
  554. ///    vector of [4 x double] with the second 256-bit vector of [4 x double].
  555. ///
  556. /// \headerfile <x86intrin.h>
  557. ///
  558. /// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
  559. ///
  560. /// \param __a
  561. ///    A 256-bit vector of [4 x double] containing the left source operand. The
  562. ///    one's complement of this value is used in the bitwise AND.
  563. /// \param __b
  564. ///    A 256-bit vector of [4 x double] containing the right source operand.
  565. /// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
  566. ///    values of the second operand and the one's complement of the first
  567. ///    operand.
  568. static __inline __m256d __DEFAULT_FN_ATTRS
  569. _mm256_andnot_pd(__m256d __a, __m256d __b) {
  570.   __v4du __bits = ~(__v4du)__a & (__v4du)__b;
  571.   return (__m256d)__bits;
  572. }
  573.  
  574. /// Computes the bitwise AND of the one's complement of the first 256-bit
  575. ///    vector of [8 x float] with the second 256-bit vector of [8 x float].
  576. ///
  577. /// \headerfile <x86intrin.h>
  578. ///
  579. /// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
  580. ///
  581. /// \param __a
  582. ///    A 256-bit vector of [8 x float] containing the left source operand. The
  583. ///    one's complement of this value is used in the bitwise AND.
  584. /// \param __b
  585. ///    A 256-bit vector of [8 x float] containing the right source operand.
  586. /// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
  587. ///    values of the second operand and the one's complement of the first
  588. ///    operand.
  589. static __inline __m256 __DEFAULT_FN_ATTRS
  590. _mm256_andnot_ps(__m256 __a, __m256 __b) {
  591.   __v8su __bits = ~(__v8su)__a & (__v8su)__b;
  592.   return (__m256)__bits;
  593. }
  594.  
  595. /// Computes the bitwise OR of two 256-bit vectors of [4 x double].
  596. ///
  597. /// \headerfile <x86intrin.h>
  598. ///
  599. /// This intrinsic corresponds to the <c> VORPD </c> instruction.
  600. ///
  601. /// \param __a
  602. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  603. /// \param __b
  604. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  605. /// \returns A 256-bit vector of [4 x double] holding the bitwise OR of the
  606. ///    bit patterns of both operands.
  607. static __inline __m256d __DEFAULT_FN_ATTRS
  608. _mm256_or_pd(__m256d __a, __m256d __b) {
  609.   __v4du __bits = (__v4du)__a | (__v4du)__b;
  610.   return (__m256d)__bits;
  611. }
  612.  
  613. /// Computes the bitwise OR of two 256-bit vectors of [8 x float].
  614. ///
  615. /// \headerfile <x86intrin.h>
  616. ///
  617. /// This intrinsic corresponds to the <c> VORPS </c> instruction.
  618. ///
  619. /// \param __a
  620. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  621. /// \param __b
  622. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  623. /// \returns A 256-bit vector of [8 x float] holding the bitwise OR of the
  624. ///    bit patterns of both operands.
  625. static __inline __m256 __DEFAULT_FN_ATTRS
  626. _mm256_or_ps(__m256 __a, __m256 __b) {
  627.   __v8su __bits = (__v8su)__a | (__v8su)__b;
  628.   return (__m256)__bits;
  629. }
  630.  
  631. /// Computes the bitwise XOR of two 256-bit vectors of [4 x double].
  632. ///
  633. /// \headerfile <x86intrin.h>
  634. ///
  635. /// This intrinsic corresponds to the <c> VXORPD </c> instruction.
  636. ///
  637. /// \param __a
  638. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  639. /// \param __b
  640. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  641. /// \returns A 256-bit vector of [4 x double] holding the bitwise XOR of the
  642. ///    bit patterns of both operands.
  643. static __inline __m256d __DEFAULT_FN_ATTRS
  644. _mm256_xor_pd(__m256d __a, __m256d __b) {
  645.   __v4du __bits = (__v4du)__a ^ (__v4du)__b;
  646.   return (__m256d)__bits;
  647. }
  648.  
  649. /// Computes the bitwise XOR of two 256-bit vectors of [8 x float].
  650. ///
  651. /// \headerfile <x86intrin.h>
  652. ///
  653. /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
  654. ///
  655. /// \param __a
  656. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  657. /// \param __b
  658. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  659. /// \returns A 256-bit vector of [8 x float] holding the bitwise XOR of the
  660. ///    bit patterns of both operands.
  661. static __inline __m256 __DEFAULT_FN_ATTRS
  662. _mm256_xor_ps(__m256 __a, __m256 __b) {
  663.   __v8su __bits = (__v8su)__a ^ (__v8su)__b;
  664.   return (__m256)__bits;
  665. }
  666.  
  667. /* Horizontal arithmetic */
  668. /// Sums each adjacent pair of values within two 256-bit vectors of
  669. ///    [4 x double] (horizontal add).
  670. ///
  671. /// \headerfile <x86intrin.h>
  672. ///
  673. /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
  674. ///
  675. /// \param __a
  676. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  677. ///    The horizontal sums of the values are returned in the even-indexed
  678. ///    elements of a vector of [4 x double].
  679. /// \param __b
  680. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  681. ///    The horizontal sums of the values are returned in the odd-indexed
  682. ///    elements of a vector of [4 x double].
  683. /// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
  684. ///    both operands.
  685. static __inline __m256d __DEFAULT_FN_ATTRS
  686. _mm256_hadd_pd(__m256d __a, __m256d __b) {
  687.   __v4df __lhs = (__v4df)__a, __rhs = (__v4df)__b;
  688.   return (__m256d)__builtin_ia32_haddpd256(__lhs, __rhs);
  689. }
  690.  
  691. /// Sums each adjacent pair of values within two 256-bit vectors of
  692. ///    [8 x float] (horizontal add).
  693. ///
  694. /// \headerfile <x86intrin.h>
  695. ///
  696. /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
  697. ///
  698. /// \param __a
  699. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  700. ///    The horizontal sums of the values are returned in the elements with
  701. ///    index 0, 1, 4, 5 of a vector of [8 x float].
  702. /// \param __b
  703. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  704. ///    The horizontal sums of the values are returned in the elements with
  705. ///    index 2, 3, 6, 7 of a vector of [8 x float].
  706. /// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
  707. ///    both operands.
  708. static __inline __m256 __DEFAULT_FN_ATTRS
  709. _mm256_hadd_ps(__m256 __a, __m256 __b) {
  710.   __v8sf __lhs = (__v8sf)__a, __rhs = (__v8sf)__b;
  711.   return (__m256)__builtin_ia32_haddps256(__lhs, __rhs);
  712. }
  713.  
  714. /// Takes the difference of each adjacent pair of values within two
  715. ///    256-bit vectors of [4 x double] (horizontal subtract).
  716. ///
  717. /// \headerfile <x86intrin.h>
  718. ///
  719. /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
  720. ///
  721. /// \param __a
  722. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  723. ///    The horizontal differences between the values are returned in the
  724. ///    even-indexed elements of a vector of [4 x double].
  725. /// \param __b
  726. ///    A 256-bit vector of [4 x double] containing one of the source operands.
  727. ///    The horizontal differences between the values are returned in the
  728. ///    odd-indexed elements of a vector of [4 x double].
  729. /// \returns A 256-bit vector of [4 x double] containing the horizontal
  730. ///    differences of both operands.
  731. static __inline __m256d __DEFAULT_FN_ATTRS
  732. _mm256_hsub_pd(__m256d __a, __m256d __b) {
  733.   __v4df __lhs = (__v4df)__a, __rhs = (__v4df)__b;
  734.   return (__m256d)__builtin_ia32_hsubpd256(__lhs, __rhs);
  735. }
  736.  
  737. /// Horizontally subtracts the adjacent pairs of values contained in two
  738. ///    256-bit vectors of [8 x float].
  739. ///
  740. /// \headerfile <x86intrin.h>
  741. ///
  742. /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
  743. ///
  744. /// \param __a
  745. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  746. ///    The horizontal differences between the values are returned in the
  747. ///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
  748. /// \param __b
  749. ///    A 256-bit vector of [8 x float] containing one of the source operands.
  750. ///    The horizontal differences between the values are returned in the
  751. ///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
  752. /// \returns A 256-bit vector of [8 x float] containing the horizontal
  753. ///    differences of both operands.
  754. static __inline __m256 __DEFAULT_FN_ATTRS
  755. _mm256_hsub_ps(__m256 __a, __m256 __b)
  756. { /* Thin wrapper: operands go to the builtin as __v8sf; result is cast back. */
  757.   return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
  758. }
  759.  
  760. /* Vector permutations */
  761. /// Copies the values in a 128-bit vector of [2 x double] as specified
  762. ///    by the 128-bit integer vector operand.
  763. ///
  764. /// \headerfile <x86intrin.h>
  765. ///
  766. /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
  767. ///
  768. /// \param __a
  769. ///    A 128-bit vector of [2 x double].
  770. /// \param __c
  771. ///    A 128-bit integer vector operand specifying how the values are to be
  772. ///    copied. \n
  773. ///    Bit [1]: \n
  774. ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
  775. ///         vector. \n
  776. ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
  777. ///         returned vector. \n
  778. ///    Bit [65]: \n
  779. ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
  780. ///         returned vector. \n
  781. ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
  782. ///         returned vector.
  783. /// \returns A 128-bit vector of [2 x double] containing the copied values.
  784. static __inline __m128d __DEFAULT_FN_ATTRS128
  785. _mm_permutevar_pd(__m128d __a, __m128i __c)
  786. { /* 128-bit form: data is passed as __v2df, the control vector __c as __v2di. */
  787.   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
  788. }
  789.  
  790. /// Copies the values in a 256-bit vector of [4 x double] as specified
  791. ///    by the 256-bit integer vector operand.
  792. ///
  793. /// \headerfile <x86intrin.h>
  794. ///
  795. /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
  796. ///
  797. /// \param __a
  798. ///    A 256-bit vector of [4 x double].
  799. /// \param __c
  800. ///    A 256-bit integer vector operand specifying how the values are to be
  801. ///    copied. \n
  802. ///    Bit [1]: \n
  803. ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
  804. ///         vector. \n
  805. ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
  806. ///         returned vector. \n
  807. ///    Bit [65]: \n
  808. ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
  809. ///         returned vector. \n
  810. ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
  811. ///         returned vector. \n
  812. ///    Bit [129]: \n
  813. ///      0: Bits [191:128] of the source are copied to bits [191:128] of the
  814. ///         returned vector. \n
  815. ///      1: Bits [255:192] of the source are copied to bits [191:128] of the
  816. ///         returned vector. \n
  817. ///    Bit [193]: \n
  818. ///      0: Bits [191:128] of the source are copied to bits [255:192] of the
  819. ///         returned vector. \n
  820. ///      1: Bits [255:192] of the source are copied to bits [255:192] of the
  821. ///         returned vector.
  822. /// \returns A 256-bit vector of [4 x double] containing the copied values.
  823. static __inline __m256d __DEFAULT_FN_ATTRS
  824. _mm256_permutevar_pd(__m256d __a, __m256i __c)
  825. { /* 256-bit form: data is passed as __v4df, the control vector __c as __v4di. */
  826.   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
  827. }
  828.  
  829. /// Copies the values stored in a 128-bit vector of [4 x float] as
  830. ///    specified by the 128-bit integer vector operand.
  831. /// \headerfile <x86intrin.h>
  832. ///
  833. /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
  834. ///
  835. /// \param __a
  836. ///    A 128-bit vector of [4 x float].
  837. /// \param __c
  838. ///    A 128-bit integer vector operand specifying how the values are to be
  839. ///    copied. \n
  840. ///    Bits [1:0]: \n
  841. ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
  842. ///          returned vector. \n
  843. ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
  844. ///          returned vector. \n
  845. ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
  846. ///          returned vector. \n
  847. ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
  848. ///          returned vector. \n
  849. ///    Bits [33:32]: \n
  850. ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
  851. ///          returned vector. \n
  852. ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
  853. ///          returned vector. \n
  854. ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
  855. ///          returned vector. \n
  856. ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
  857. ///          returned vector. \n
  858. ///    Bits [65:64]: \n
  859. ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
  860. ///          returned vector. \n
  861. ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
  862. ///          returned vector. \n
  863. ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
  864. ///          returned vector. \n
  865. ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
  866. ///          returned vector. \n
  867. ///    Bits [97:96]: \n
  868. ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
  869. ///          returned vector. \n
  870. ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
  871. ///          returned vector. \n
  872. ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
  873. ///          returned vector. \n
  874. ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
  875. ///          returned vector.
  876. /// \returns A 128-bit vector of [4 x float] containing the copied values.
  877. static __inline __m128 __DEFAULT_FN_ATTRS128
  878. _mm_permutevar_ps(__m128 __a, __m128i __c)
  879. { /* 128-bit form: data is passed as __v4sf, the control vector __c as __v4si. */
  880.   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
  881. }
  882.  
  883. /// Copies the values stored in a 256-bit vector of [8 x float] as
  884. ///    specified by the 256-bit integer vector operand.
  885. ///
  886. /// \headerfile <x86intrin.h>
  887. ///
  888. /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
  889. ///
  890. /// \param __a
  891. ///    A 256-bit vector of [8 x float].
  892. /// \param __c
  893. ///    A 256-bit integer vector operand specifying how the values are to be
  894. ///    copied. \n
  895. ///    Bits [1:0]: \n
  896. ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
  897. ///          returned vector. \n
  898. ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
  899. ///          returned vector. \n
  900. ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
  901. ///          returned vector. \n
  902. ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
  903. ///          returned vector. \n
  904. ///    Bits [33:32]: \n
  905. ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
  906. ///          returned vector. \n
  907. ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
  908. ///          returned vector. \n
  909. ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
  910. ///          returned vector. \n
  911. ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
  912. ///          returned vector. \n
  913. ///    Bits [65:64]: \n
  914. ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
  915. ///          returned vector. \n
  916. ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
  917. ///          returned vector. \n
  918. ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
  919. ///          returned vector. \n
  920. ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
  921. ///          returned vector. \n
  922. ///    Bits [97:96]: \n
  923. ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
  924. ///          returned vector. \n
  925. ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
  926. ///          returned vector. \n
  927. ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
  928. ///          returned vector. \n
  929. ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
  930. ///          returned vector. \n
  931. ///    Bits [129:128]: \n
  932. ///      00: Bits [159:128] of the source are copied to bits [159:128] of the
  933. ///          returned vector. \n
  934. ///      01: Bits [191:160] of the source are copied to bits [159:128] of the
  935. ///          returned vector. \n
  936. ///      10: Bits [223:192] of the source are copied to bits [159:128] of the
  937. ///          returned vector. \n
  938. ///      11: Bits [255:224] of the source are copied to bits [159:128] of the
  939. ///          returned vector. \n
  940. ///    Bits [161:160]: \n
  941. ///      00: Bits [159:128] of the source are copied to bits [191:160] of the
  942. ///          returned vector. \n
  943. ///      01: Bits [191:160] of the source are copied to bits [191:160] of the
  944. ///          returned vector. \n
  945. ///      10: Bits [223:192] of the source are copied to bits [191:160] of the
  946. ///          returned vector. \n
  947. ///      11: Bits [255:224] of the source are copied to bits [191:160] of the
  948. ///          returned vector. \n
  949. ///    Bits [193:192]: \n
  950. ///      00: Bits [159:128] of the source are copied to bits [223:192] of the
  951. ///          returned vector. \n
  952. ///      01: Bits [191:160] of the source are copied to bits [223:192] of the
  953. ///          returned vector. \n
  954. ///      10: Bits [223:192] of the source are copied to bits [223:192] of the
  955. ///          returned vector. \n
  956. ///      11: Bits [255:224] of the source are copied to bits [223:192] of the
  957. ///          returned vector. \n
  958. ///    Bits [225:224]: \n
  959. ///      00: Bits [159:128] of the source are copied to bits [255:224] of the
  960. ///          returned vector. \n
  961. ///      01: Bits [191:160] of the source are copied to bits [255:224] of the
  962. ///          returned vector. \n
  963. ///      10: Bits [223:192] of the source are copied to bits [255:224] of the
  964. ///          returned vector. \n
  965. ///      11: Bits [255:224] of the source are copied to bits [255:224] of the
  966. ///          returned vector.
  967. /// \returns A 256-bit vector of [8 x float] containing the copied values.
  968. static __inline __m256 __DEFAULT_FN_ATTRS
  969. _mm256_permutevar_ps(__m256 __a, __m256i __c)
  970. { /* 256-bit form: data is passed as __v8sf, the control vector __c as __v8si. */
  971.   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
  972. }
  973.  
  974. /// Copies the values in a 128-bit vector of [2 x double] as specified
  975. ///    by the immediate integer operand.
  976. ///
  977. /// \headerfile <x86intrin.h>
  978. ///
  979. /// \code
  980. /// __m128d _mm_permute_pd(__m128d A, const int C);
  981. /// \endcode
  982. ///
  983. /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
  984. ///
  985. /// \param A
  986. ///    A 128-bit vector of [2 x double].
  987. /// \param C
  988. ///    An immediate integer operand specifying how the values are to be
  989. ///    copied. \n
  990. ///    Bit [0]: \n
  991. ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
  992. ///         vector. \n
  993. ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
  994. ///         returned vector. \n
  995. ///    Bit [1]: \n
  996. ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
  997. ///         returned vector. \n
  998. ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
  999. ///         returned vector.
  1000. /// \returns A 128-bit vector of [2 x double] containing the copied values.
  1001. #define _mm_permute_pd(A, C) /* macro form; C is documented as an immediate */ \
  1002.   ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
  1003.  
  1004. /// Copies the values in a 256-bit vector of [4 x double] as specified by
  1005. ///    the immediate integer operand.
  1006. ///
  1007. /// \headerfile <x86intrin.h>
  1008. ///
  1009. /// \code
  1010. /// __m256d _mm256_permute_pd(__m256d A, const int C);
  1011. /// \endcode
  1012. ///
  1013. /// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
  1014. ///
  1015. /// \param A
  1016. ///    A 256-bit vector of [4 x double].
  1017. /// \param C
  1018. ///    An immediate integer operand specifying how the values are to be
  1019. ///    copied. \n
  1020. ///    Bit [0]: \n
  1021. ///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
  1022. ///         vector. \n
  1023. ///      1: Bits [127:64] of the source are copied to bits [63:0] of the
  1024. ///         returned vector. \n
  1025. ///    Bit [1]: \n
  1026. ///      0: Bits [63:0] of the source are copied to bits [127:64] of the
  1027. ///         returned vector. \n
  1028. ///      1: Bits [127:64] of the source are copied to bits [127:64] of the
  1029. ///         returned vector. \n
  1030. ///    Bit [2]: \n
  1031. ///      0: Bits [191:128] of the source are copied to bits [191:128] of the
  1032. ///         returned vector. \n
  1033. ///      1: Bits [255:192] of the source are copied to bits [191:128] of the
  1034. ///         returned vector. \n
  1035. ///    Bit [3]: \n
  1036. ///      0: Bits [191:128] of the source are copied to bits [255:192] of the
  1037. ///         returned vector. \n
  1038. ///      1: Bits [255:192] of the source are copied to bits [255:192] of the
  1039. ///         returned vector.
  1040. /// \returns A 256-bit vector of [4 x double] containing the copied values.
  1041. #define _mm256_permute_pd(A, C) /* macro form; C is documented as an immediate */ \
  1042.   ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
  1043.  
  1044. /// Copies the values in a 128-bit vector of [4 x float] as specified by
  1045. ///    the immediate integer operand.
  1046. ///
  1047. /// \headerfile <x86intrin.h>
  1048. ///
  1049. /// \code
  1050. /// __m128 _mm_permute_ps(__m128 A, const int C);
  1051. /// \endcode
  1052. ///
  1053. /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
  1054. ///
  1055. /// \param A
  1056. ///    A 128-bit vector of [4 x float].
  1057. /// \param C
  1058. ///    An immediate integer operand specifying how the values are to be
  1059. ///    copied. \n
  1060. ///    Bits [1:0]: \n
  1061. ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
  1062. ///          returned vector. \n
  1063. ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
  1064. ///          returned vector. \n
  1065. ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
  1066. ///          returned vector. \n
  1067. ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
  1068. ///          returned vector. \n
  1069. ///    Bits [3:2]: \n
  1070. ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
  1071. ///          returned vector. \n
  1072. ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
  1073. ///          returned vector. \n
  1074. ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
  1075. ///          returned vector. \n
  1076. ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
  1077. ///          returned vector. \n
  1078. ///    Bits [5:4]: \n
  1079. ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
  1080. ///          returned vector. \n
  1081. ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
  1082. ///          returned vector. \n
  1083. ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
  1084. ///          returned vector. \n
  1085. ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
  1086. ///          returned vector. \n
  1087. ///    Bits [7:6]: \n
  1088. ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
  1089. ///          returned vector. \n
  1090. ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
  1091. ///          returned vector. \n
  1092. ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
  1093. ///          returned vector. \n
  1094. ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
  1095. ///          returned vector.
  1096. /// \returns A 128-bit vector of [4 x float] containing the copied values.
  1097. #define _mm_permute_ps(A, C) /* macro form; C is documented as an immediate */ \
  1098.   ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
  1099.  
  1100. /// Copies the values in a 256-bit vector of [8 x float] as specified by
  1101. ///    the immediate integer operand.
  1102. ///
  1103. /// \headerfile <x86intrin.h>
  1104. ///
  1105. /// \code
  1106. /// __m256 _mm256_permute_ps(__m256 A, const int C);
  1107. /// \endcode
  1108. ///
  1109. /// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
  1110. ///
  1111. /// \param A
  1112. ///    A 256-bit vector of [8 x float].
  1113. /// \param C
  1114. ///    An immediate integer operand specifying how the values are to be
  1115. ///    copied. \n
  1116. ///    Bits [1:0]: \n
  1117. ///      00: Bits [31:0] of the source are copied to bits [31:0] of the
  1118. ///          returned vector. \n
  1119. ///      01: Bits [63:32] of the source are copied to bits [31:0] of the
  1120. ///          returned vector. \n
  1121. ///      10: Bits [95:64] of the source are copied to bits [31:0] of the
  1122. ///          returned vector. \n
  1123. ///      11: Bits [127:96] of the source are copied to bits [31:0] of the
  1124. ///          returned vector. \n
  1125. ///    Bits [3:2]: \n
  1126. ///      00: Bits [31:0] of the source are copied to bits [63:32] of the
  1127. ///          returned vector. \n
  1128. ///      01: Bits [63:32] of the source are copied to bits [63:32] of the
  1129. ///          returned vector. \n
  1130. ///      10: Bits [95:64] of the source are copied to bits [63:32] of the
  1131. ///          returned vector. \n
  1132. ///      11: Bits [127:96] of the source are copied to bits [63:32] of the
  1133. ///          returned vector. \n
  1134. ///    Bits [5:4]: \n
  1135. ///      00: Bits [31:0] of the source are copied to bits [95:64] of the
  1136. ///          returned vector. \n
  1137. ///      01: Bits [63:32] of the source are copied to bits [95:64] of the
  1138. ///          returned vector. \n
  1139. ///      10: Bits [95:64] of the source are copied to bits [95:64] of the
  1140. ///          returned vector. \n
  1141. ///      11: Bits [127:96] of the source are copied to bits [95:64] of the
  1142. ///          returned vector. \n
  1143. ///    Bits [7:6]: \n
  1144. ///      00: Bits [31:0] of the source are copied to bits [127:96] of the
  1145. ///          returned vector. \n
  1146. ///      01: Bits [63:32] of the source are copied to bits [127:96] of the
  1147. ///          returned vector. \n
  1148. ///      10: Bits [95:64] of the source are copied to bits [127:96] of the
  1149. ///          returned vector. \n
  1150. ///      11: Bits [127:96] of the source are copied to bits [127:96] of the
  1151. ///          returned vector. \n
  1152. ///    Bits [1:0] (upper 128-bit half): \n
  1153. ///      00: Bits [159:128] of the source are copied to bits [159:128] of the
  1154. ///          returned vector. \n
  1155. ///      01: Bits [191:160] of the source are copied to bits [159:128] of the
  1156. ///          returned vector. \n
  1157. ///      10: Bits [223:192] of the source are copied to bits [159:128] of the
  1158. ///          returned vector. \n
  1159. ///      11: Bits [255:224] of the source are copied to bits [159:128] of the
  1160. ///          returned vector. \n
  1161. ///    Bits [3:2] (upper 128-bit half): \n
  1162. ///      00: Bits [159:128] of the source are copied to bits [191:160] of the
  1163. ///          returned vector. \n
  1164. ///      01: Bits [191:160] of the source are copied to bits [191:160] of the
  1165. ///          returned vector. \n
  1166. ///      10: Bits [223:192] of the source are copied to bits [191:160] of the
  1167. ///          returned vector. \n
  1168. ///      11: Bits [255:224] of the source are copied to bits [191:160] of the
  1169. ///          returned vector. \n
  1170. ///    Bits [5:4] (upper 128-bit half): \n
  1171. ///      00: Bits [159:128] of the source are copied to bits [223:192] of the
  1172. ///          returned vector. \n
  1173. ///      01: Bits [191:160] of the source are copied to bits [223:192] of the
  1174. ///          returned vector. \n
  1175. ///      10: Bits [223:192] of the source are copied to bits [223:192] of the
  1176. ///          returned vector. \n
  1177. ///      11: Bits [255:224] of the source are copied to bits [223:192] of the
  1178. ///          returned vector. \n
  1179. ///    Bits [7:6] (upper 128-bit half): \n
  1180. ///      00: Bits [159:128] of the source are copied to bits [255:224] of the
  1181. ///          returned vector. \n
  1182. ///      01: Bits [191:160] of the source are copied to bits [255:224] of the
  1183. ///          returned vector. \n
  1184. ///      10: Bits [223:192] of the source are copied to bits [255:224] of the
  1185. ///          returned vector. \n
  1186. ///      11: Bits [255:224] of the source are copied to bits [255:224] of the
  1187. ///          returned vector.
  1188. /// \returns A 256-bit vector of [8 x float] containing the copied values.
  1189. #define _mm256_permute_ps(A, C) /* macro form; C is documented as an immediate */ \
  1190.   ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
  1191.  
  1192. /// Permutes 128-bit data values stored in two 256-bit vectors of
  1193. ///    [4 x double], as specified by the immediate integer operand.
  1194. ///
  1195. /// \headerfile <x86intrin.h>
  1196. ///
  1197. /// \code
  1198. /// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
  1199. /// \endcode
  1200. ///
  1201. /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
  1202. ///
  1203. /// \param V1
  1204. ///    A 256-bit vector of [4 x double].
  1205. /// \param V2
  1206. ///    A 256-bit vector of [4 x double].
  1207. /// \param M
  1208. ///    An immediate integer operand specifying how the values are to be
  1209. ///    permuted. \n
  1210. ///    Bits [1:0]: \n
  1211. ///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
  1212. ///          destination. \n
  1213. ///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
  1214. ///          destination. \n
  1215. ///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
  1216. ///          destination. \n
  1217. ///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
  1218. ///          destination. \n
  1219. ///    Bits [5:4]: \n
  1220. ///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
  1221. ///          destination. \n
  1222. ///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
  1223. ///          destination. \n
  1224. ///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
  1225. ///          destination. \n
  1226. ///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
  1227. ///          destination.
  1228. /// \returns A 256-bit vector of [4 x double] containing the copied values.
  1229. #define _mm256_permute2f128_pd(V1, V2, M) /* macro form; M is an immediate */ \
  1230.   ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
  1231.                                             (__v4df)(__m256d)(V2), (int)(M)))
  1232.  
  1233. /// Permutes 128-bit data values stored in two 256-bit vectors of
  1234. ///    [8 x float], as specified by the immediate integer operand.
  1235. ///
  1236. /// \headerfile <x86intrin.h>
  1237. ///
  1238. /// \code
  1239. /// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
  1240. /// \endcode
  1241. ///
  1242. /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
  1243. ///
  1244. /// \param V1
  1245. ///    A 256-bit vector of [8 x float].
  1246. /// \param V2
  1247. ///    A 256-bit vector of [8 x float].
  1248. /// \param M
  1249. ///    An immediate integer operand specifying how the values are to be
  1250. ///    permuted. \n
  1251. ///    Bits [1:0]: \n
  1252. ///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
  1253. ///    destination. \n
  1254. ///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
  1255. ///    destination. \n
  1256. ///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
  1257. ///    destination. \n
  1258. ///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
  1259. ///    destination. \n
  1260. ///    Bits [5:4]: \n
  1261. ///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
  1262. ///    destination. \n
  1263. ///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
  1264. ///    destination. \n
  1265. ///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
  1266. ///    destination. \n
  1267. ///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
  1268. ///    destination.
  1269. /// \returns A 256-bit vector of [8 x float] containing the copied values.
  1270. #define _mm256_permute2f128_ps(V1, V2, M) /* macro form; M is an immediate */ \
  1271.   ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
  1272.                                            (__v8sf)(__m256)(V2), (int)(M)))
  1273.  
  1274. /// Permutes 128-bit data values stored in two 256-bit integer vectors,
  1275. ///    as specified by the immediate integer operand.
  1276. ///
  1277. /// \headerfile <x86intrin.h>
  1278. ///
  1279. /// \code
  1280. /// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
  1281. /// \endcode
  1282. ///
  1283. /// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
  1284. ///
  1285. /// \param V1
  1286. ///    A 256-bit integer vector.
  1287. /// \param V2
  1288. ///    A 256-bit integer vector.
  1289. /// \param M
  1290. ///    An immediate integer operand specifying how the values are copied. \n
  1291. ///    Bits [1:0]: \n
  1292. ///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
  1293. ///    destination. \n
  1294. ///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
  1295. ///    destination. \n
  1296. ///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
  1297. ///    destination. \n
  1298. ///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
  1299. ///    destination. \n
  1300. ///    Bits [5:4]: \n
  1301. ///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
  1302. ///    destination. \n
  1303. ///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
  1304. ///    destination. \n
  1305. ///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
  1306. ///    destination. \n
  1307. ///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
  1308. ///    destination.
  1309. /// \returns A 256-bit integer vector containing the copied values.
  1310. #define _mm256_permute2f128_si256(V1, V2, M) /* __m256i operands are passed as __v8si */ \
  1311.   ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
  1312.                                             (__v8si)(__m256i)(V2), (int)(M)))
  1313.  
  1314. /* Vector Blend */
  1315. /// Merges 64-bit double-precision data values stored in either of the
  1316. ///    two 256-bit vectors of [4 x double], as specified by the immediate
  1317. ///    integer operand.
  1318. ///
  1319. /// \headerfile <x86intrin.h>
  1320. ///
  1321. /// \code
  1322. /// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
  1323. /// \endcode
  1324. ///
  1325. /// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
  1326. ///
  1327. /// \param V1
  1328. ///    A 256-bit vector of [4 x double].
  1329. /// \param V2
  1330. ///    A 256-bit vector of [4 x double].
  1331. /// \param M
  1332. ///    An immediate integer operand, with mask bits [3:0] specifying how the
  1333. ///    values are to be copied. The position of the mask bit corresponds to the
  1334. ///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
  1335. ///    element in operand \a V1 is copied to the same position in the
  1336. ///    destination. When a mask bit is 1, the corresponding 64-bit element in
  1337. ///    operand \a V2 is copied to the same position in the destination.
  1338. /// \returns A 256-bit vector of [4 x double] containing the copied values.
  1339. #define _mm256_blend_pd(V1, V2, M) /* macro form; M is the immediate mask */ \
  1340.   ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
  1341.                                       (__v4df)(__m256d)(V2), (int)(M)))
  1342.  
  1343. /// Merges 32-bit single-precision data values stored in either of the
  1344. ///    two 256-bit vectors of [8 x float], as specified by the immediate
  1345. ///    integer operand.
  1346. ///
  1347. /// \headerfile <x86intrin.h>
  1348. ///
  1349. /// \code
  1350. /// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
  1351. /// \endcode
  1352. ///
  1353. /// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
  1354. ///
  1355. /// \param V1
  1356. ///    A 256-bit vector of [8 x float].
  1357. /// \param V2
  1358. ///    A 256-bit vector of [8 x float].
  1359. /// \param M
  1360. ///    An immediate integer operand, with mask bits [7:0] specifying how the
  1361. ///    values are to be copied. The position of the mask bit corresponds to the
  1362. ///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
  1363. ///    element in operand \a V1 is copied to the same position in the
  1364. ///    destination. When a mask bit is 1, the corresponding 32-bit element in
  1365. ///    operand \a V2 is copied to the same position in the destination.
  1366. /// \returns A 256-bit vector of [8 x float] containing the copied values.
  1367. #define _mm256_blend_ps(V1, V2, M) /* macro form; M is the immediate mask */ \
  1368.   ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
  1369.                                      (__v8sf)(__m256)(V2), (int)(M)))
  1370.  
  1371. /// Merges 64-bit double-precision data values stored in either of the
  1372. ///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
  1373. ///    mask operand \a __c.
  1374. ///
  1375. /// \headerfile <x86intrin.h>
  1376. ///
  1377. /// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
  1378. ///
  1379. /// \param __a
  1380. ///    A 256-bit vector of [4 x double].
  1381. /// \param __b
  1382. ///    A 256-bit vector of [4 x double].
  1383. /// \param __c
  1384. ///    A 256-bit vector of [4 x double] used as the mask. Bits 255, 191, 127,
  1385. ///    and 63 (the most significant bit of each 64-bit element) select the
  1386. ///    source of the matching destination element. When a mask bit is 0, the
  1387. ///    corresponding 64-bit element in operand \a __a is copied to the same
  1388. ///    position in the destination. When a mask bit is 1, the corresponding
  1389. ///    64-bit element in operand \a __b is copied to the same position in the
  1390. ///    destination.
  1391. /// \returns A 256-bit vector of [4 x double] containing the copied values.
  1392. static __inline __m256d __DEFAULT_FN_ATTRS
  1393. _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
  1394. {
  1395.   return (__m256d)__builtin_ia32_blendvpd256(
  1396.     (__v4df)__a, (__v4df)__b, (__v4df)__c);
  1397. }
  1398.  
  1399. /// Merges 32-bit single-precision data values stored in either of the
  1400. ///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
  1401. ///    mask operand \a __c.
  1402. ///
  1403. /// \headerfile <x86intrin.h>
  1404. ///
  1405. /// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
  1406. ///
  1407. /// \param __a
  1408. ///    A 256-bit vector of [8 x float].
  1409. /// \param __b
  1410. ///    A 256-bit vector of [8 x float].
  1411. /// \param __c
  1412. ///    A 256-bit vector of [8 x float] used as the mask. Bits 255, 223, 191,
  1413. ///    159, 127, 95, 63, and 31 (the most significant bit of each 32-bit
  1414. ///    element) select the source of the matching destination element. When
  1415. ///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
  1416. ///    copied to the same position in the destination. When a mask bit is 1, the
  1417. ///    corresponding 32-bit element in operand \a __b is copied to the same
  1418. ///    position in the destination.
  1419. /// \returns A 256-bit vector of [8 x float] containing the copied values.
  1420. static __inline __m256 __DEFAULT_FN_ATTRS
  1421. _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
  1422. {
  1423.   return (__m256)__builtin_ia32_blendvps256(
  1424.     (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
  1425. }
  1426.  
  1427. /* Vector Dot Product */
  1428. /// Computes two dot products in parallel, using the lower and upper
  1429. ///    halves of two [8 x float] vectors as input to the two computations, and
  1430. ///    returning the two dot products in the lower and upper halves of the
  1431. ///    [8 x float] result.
  1432. ///
  1433. ///    The immediate integer operand controls which input elements will
  1434. ///    contribute to the dot product, and where the final results are returned.
  1435. ///    In general, for each dot product, the four corresponding elements of the
  1436. ///    input vectors are multiplied; the first two and second two products are
  1437. ///    summed, then the two sums are added to form the final result.
  1438. ///
  1439. /// \headerfile <x86intrin.h>
  1440. ///
  1441. /// \code
  1442. /// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
  1443. /// \endcode
  1444. ///
  1445. /// This intrinsic corresponds to the <c> VDPPS </c> instruction.
  1446. ///
  1447. /// \param V1
  1448. ///    A 256-bit vector of [8 x float], treated as two [4 x float] vectors.
  1449. /// \param V2
  1450. ///    A 256-bit vector of [8 x float], treated as two [4 x float] vectors.
  1451. /// \param M
  1452. ///    An immediate integer argument. Bits [7:4] determine which elements of
  1453. ///    the input vectors are used, with bit [4] corresponding to the lowest
  1454. ///    element and bit [7] corresponding to the highest element of each [4 x
  1455. ///    float] subvector. If a bit is set, the corresponding elements from the
  1456. ///    two input vectors are used as an input for dot product; otherwise that
  1457. ///    input is treated as zero. Bits [3:0] determine which elements of the
  1458. ///    result will receive a copy of the final dot product, with bit [0]
  1459. ///    corresponding to the lowest element and bit [3] corresponding to the
  1460. ///    highest element of each [4 x float] subvector. If a bit is set, the dot
  1461. ///    product is returned in the corresponding element; otherwise that element
  1462. ///    is set to zero. The bitmask is applied in the same way to each of the
  1463. ///    two parallel dot product computations.
  1464. /// \returns A 256-bit vector of [8 x float] containing the two dot products.
  1465. #define _mm256_dp_ps(V1, V2, M) \
  1466.   ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
  1467.                                   (__v8sf)(__m256)(V2), (M)))
  1468.  
  1469. /* Vector shuffle */
  1470. /// Selects 8 float values from the 256-bit operands of [8 x float], as
  1471. ///    specified by the immediate value operand.
  1472. ///
  1473. ///    The four selected elements in each operand are copied to the destination
  1474. ///    according to the bits specified in the immediate operand. The selected
  1475. ///    elements from the first 256-bit operand are copied to bits [63:0] and
  1476. ///    bits [191:128] of the destination, and the selected elements from the
  1477. ///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
  1478. ///    the destination. For example, if bits [7:0] of the immediate operand
  1479. ///    contain a value of 0xFF, the 256-bit destination vector would contain the
  1480. ///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
  1481. ///
  1482. /// \headerfile <x86intrin.h>
  1483. ///
  1484. /// \code
  1485. /// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
  1486. /// \endcode
  1487. ///
  1488. /// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
  1489. ///
  1490. /// \param a
  1491. ///    A 256-bit vector of [8 x float]. The four selected elements in this
  1492. ///    operand are copied to bits [63:0] and bits [191:128] in the destination,
  1493. ///    according to the bits specified in the immediate operand.
  1494. /// \param b
  1495. ///    A 256-bit vector of [8 x float]. The four selected elements in this
  1496. ///    operand are copied to bits [127:64] and bits [255:192] in the
  1497. ///    destination, according to the bits specified in the immediate operand.
  1498. /// \param mask
  1499. ///    An immediate value containing an 8-bit value specifying which elements to
  1500. ///    copy from \a a and \a b. \n
  1501. ///    Bits [3:0] specify the values copied from operand \a a. \n
  1502. ///    Bits [7:4] specify the values copied from operand \a b. \n
  1503. ///    The destinations within the 256-bit destination are assigned values as
  1504. ///    follows, according to the bit value assignments described below: \n
  1505. ///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
  1506. ///    destination. \n
  1507. ///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
  1508. ///    destination. \n
  1509. ///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
  1510. ///    destination. \n
  1511. ///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
  1512. ///    the destination. \n
  1513. ///    Bit value assignments: \n
  1514. ///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
  1515. ///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
  1516. ///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
  1517. ///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
  1518. ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
  1519. ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
  1520. ///    <c>[b6, b4, b2, b0]</c>.
  1521. /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
  1522. #define _mm256_shuffle_ps(a, b, mask) \
  1523.   ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
  1524.                                     (__v8sf)(__m256)(b), (int)(mask)))
  1525.  
  1526. /// Selects four double-precision values from the 256-bit operands of
  1527. ///    [4 x double], as specified by the immediate value operand.
  1528. ///
  1529. ///    The selected elements from the first 256-bit operand are copied to bits
  1530. ///    [63:0] and bits [191:128] in the destination, and the selected elements
  1531. ///    from the second 256-bit operand are copied to bits [127:64] and bits
  1532. ///    [255:192] in the destination. For example, if bits [3:0] of the immediate
  1533. ///    operand contain a value of 0xF, the 256-bit destination vector would
  1534. ///    contain the following values: b[3], a[3], b[1], a[1].
  1535. ///
  1536. /// \headerfile <x86intrin.h>
  1537. ///
  1538. /// \code
  1539. /// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
  1540. /// \endcode
  1541. ///
  1542. /// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
  1543. ///
  1544. /// \param a
  1545. ///    A 256-bit vector of [4 x double].
  1546. /// \param b
  1547. ///    A 256-bit vector of [4 x double].
  1548. /// \param mask
  1549. ///    An immediate value whose bits [3:0] specify which elements to copy from
  1550. ///    \a a and \a b: \n
  1551. ///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
  1552. ///    destination. \n
  1553. ///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
  1554. ///    destination. \n
  1555. ///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
  1556. ///    destination. \n
  1557. ///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
  1558. ///    destination. \n
  1559. ///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
  1560. ///    destination. \n
  1561. ///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
  1562. ///    destination. \n
  1563. ///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
  1564. ///    destination. \n
  1565. ///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
  1566. ///    destination.
  1567. /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
  1568. #define _mm256_shuffle_pd(a, b, mask) \
  1569.   ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
  1570.                                      (__v4df)(__m256d)(b), (int)(mask)))
  1571.  
  1572. /* Comparison predicates for the immediate operand of the cmp intrinsics */
  1573. #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
  1574. #define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
  1575. #define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
  1576. #define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
  1577. #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
  1578. #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
  1579. #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
  1580. #define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
  1581. #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
  1582. #define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
  1583. #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
  1584. #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
  1585. #define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
  1586. #define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
  1587. #define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
  1588. #define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
  1589. #define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
  1590. #define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
  1591. #define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
  1592. #define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
  1593. #define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
  1594. #define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
  1595. #define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
  1596. #define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
  1597. #define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
  1598. #define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
  1599. #define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
  1600. #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
  1601. #define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
  1602. #define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
  1603. #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
  1604. #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
  1605.  
  1606. /// Compares each of the corresponding double-precision values of two
  1607. ///    128-bit vectors of [2 x double], using the operation specified by the
  1608. ///    immediate integer operand.
  1609. ///
  1610. ///    Returns a [2 x double] vector consisting of two doubles corresponding to
  1611. ///    the two comparison results: zero if the comparison is false, and all 1's
  1612. ///    if the comparison is true.
  1613. ///
  1614. /// \headerfile <x86intrin.h>
  1615. ///
  1616. /// \code
  1617. /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
  1618. /// \endcode
  1619. ///
  1620. /// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
  1621. ///
  1622. /// \param a
  1623. ///    A 128-bit vector of [2 x double].
  1624. /// \param b
  1625. ///    A 128-bit vector of [2 x double].
  1626. /// \param c
  1627. ///    An immediate integer operand, with bits [4:0] specifying which comparison
  1628. ///    operation to use: \n
  1629. ///    0x00: \c _CMP_EQ_OQ - Equal (ordered, non-signaling) \n
  1630. ///    0x01: \c _CMP_LT_OS - Less-than (ordered, signaling) \n
  1631. ///    0x02: \c _CMP_LE_OS - Less-than-or-equal (ordered, signaling) \n
  1632. ///    0x03: \c _CMP_UNORD_Q - Unordered (non-signaling) \n
  1633. ///    0x04: \c _CMP_NEQ_UQ - Not-equal (unordered, non-signaling) \n
  1634. ///    0x05: \c _CMP_NLT_US - Not-less-than (unordered, signaling) \n
  1635. ///    0x06: \c _CMP_NLE_US - Not-less-than-or-equal (unordered, signaling) \n
  1636. ///    0x07: \c _CMP_ORD_Q - Ordered (non-signaling) \n
  1637. ///    0x08: \c _CMP_EQ_UQ - Equal (unordered, non-signaling) \n
  1638. ///    0x09: \c _CMP_NGE_US - Not-greater-than-or-equal (unordered, signaling) \n
  1639. ///    0x0A: \c _CMP_NGT_US - Not-greater-than (unordered, signaling) \n
  1640. ///    0x0B: \c _CMP_FALSE_OQ - False (ordered, non-signaling) \n
  1641. ///    0x0C: \c _CMP_NEQ_OQ - Not-equal (ordered, non-signaling) \n
  1642. ///    0x0D: \c _CMP_GE_OS - Greater-than-or-equal (ordered, signaling) \n
  1643. ///    0x0E: \c _CMP_GT_OS - Greater-than (ordered, signaling) \n
  1644. ///    0x0F: \c _CMP_TRUE_UQ - True (unordered, non-signaling) \n
  1645. ///    0x10: \c _CMP_EQ_OS - Equal (ordered, signaling) \n
  1646. ///    0x11: \c _CMP_LT_OQ - Less-than (ordered, non-signaling) \n
  1647. ///    0x12: \c _CMP_LE_OQ - Less-than-or-equal (ordered, non-signaling) \n
  1648. ///    0x13: \c _CMP_UNORD_S - Unordered (signaling) \n
  1649. ///    0x14: \c _CMP_NEQ_US - Not-equal (unordered, signaling) \n
  1650. ///    0x15: \c _CMP_NLT_UQ - Not-less-than (unordered, non-signaling) \n
  1651. ///    0x16: \c _CMP_NLE_UQ - Not-less-than-or-equal (unordered, non-signaling) \n
  1652. ///    0x17: \c _CMP_ORD_S - Ordered (signaling) \n
  1653. ///    0x18: \c _CMP_EQ_US - Equal (unordered, signaling) \n
  1654. ///    0x19: \c _CMP_NGE_UQ - Not-greater-than-or-equal (unordered, non-signaling) \n
  1655. ///    0x1A: \c _CMP_NGT_UQ - Not-greater-than (unordered, non-signaling) \n
  1656. ///    0x1B: \c _CMP_FALSE_OS - False (ordered, signaling) \n
  1657. ///    0x1C: \c _CMP_NEQ_OS - Not-equal (ordered, signaling) \n
  1658. ///    0x1D: \c _CMP_GE_OQ - Greater-than-or-equal (ordered, non-signaling) \n
  1659. ///    0x1E: \c _CMP_GT_OQ - Greater-than (ordered, non-signaling) \n
  1660. ///    0x1F: \c _CMP_TRUE_US - True (unordered, signaling)
  1661. /// \returns A 128-bit vector of [2 x double] containing the comparison results.
  1662. #define _mm_cmp_pd(a, b, c) \
  1663.   ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
  1664.                                  (__v2df)(__m128d)(b), (c)))
  1665.  
  1666. /// Compares each of the corresponding values of two 128-bit vectors of
  1667. ///    [4 x float], using the operation specified by the immediate integer
  1668. ///    operand.
  1669. ///
  1670. ///    Returns a [4 x float] vector consisting of four floats corresponding to
  1671. ///    the four comparison results: zero if the comparison is false, and all 1's
  1672. ///    if the comparison is true.
  1673. ///
  1674. /// \headerfile <x86intrin.h>
  1675. ///
  1676. /// \code
  1677. /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
  1678. /// \endcode
  1679. ///
  1680. /// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
  1681. ///
  1682. /// \param a
  1683. ///    A 128-bit vector of [4 x float].
  1684. /// \param b
  1685. ///    A 128-bit vector of [4 x float].
  1686. /// \param c
  1687. ///    An immediate integer operand, with bits [4:0] specifying which comparison
  1688. ///    operation to use: \n
  1689. ///    0x00: \c _CMP_EQ_OQ - Equal (ordered, non-signaling) \n
  1690. ///    0x01: \c _CMP_LT_OS - Less-than (ordered, signaling) \n
  1691. ///    0x02: \c _CMP_LE_OS - Less-than-or-equal (ordered, signaling) \n
  1692. ///    0x03: \c _CMP_UNORD_Q - Unordered (non-signaling) \n
  1693. ///    0x04: \c _CMP_NEQ_UQ - Not-equal (unordered, non-signaling) \n
  1694. ///    0x05: \c _CMP_NLT_US - Not-less-than (unordered, signaling) \n
  1695. ///    0x06: \c _CMP_NLE_US - Not-less-than-or-equal (unordered, signaling) \n
  1696. ///    0x07: \c _CMP_ORD_Q - Ordered (non-signaling) \n
  1697. ///    0x08: \c _CMP_EQ_UQ - Equal (unordered, non-signaling) \n
  1698. ///    0x09: \c _CMP_NGE_US - Not-greater-than-or-equal (unordered, signaling) \n
  1699. ///    0x0A: \c _CMP_NGT_US - Not-greater-than (unordered, signaling) \n
  1700. ///    0x0B: \c _CMP_FALSE_OQ - False (ordered, non-signaling) \n
  1701. ///    0x0C: \c _CMP_NEQ_OQ - Not-equal (ordered, non-signaling) \n
  1702. ///    0x0D: \c _CMP_GE_OS - Greater-than-or-equal (ordered, signaling) \n
  1703. ///    0x0E: \c _CMP_GT_OS - Greater-than (ordered, signaling) \n
  1704. ///    0x0F: \c _CMP_TRUE_UQ - True (unordered, non-signaling) \n
  1705. ///    0x10: \c _CMP_EQ_OS - Equal (ordered, signaling) \n
  1706. ///    0x11: \c _CMP_LT_OQ - Less-than (ordered, non-signaling) \n
  1707. ///    0x12: \c _CMP_LE_OQ - Less-than-or-equal (ordered, non-signaling) \n
  1708. ///    0x13: \c _CMP_UNORD_S - Unordered (signaling) \n
  1709. ///    0x14: \c _CMP_NEQ_US - Not-equal (unordered, signaling) \n
  1710. ///    0x15: \c _CMP_NLT_UQ - Not-less-than (unordered, non-signaling) \n
  1711. ///    0x16: \c _CMP_NLE_UQ - Not-less-than-or-equal (unordered, non-signaling) \n
  1712. ///    0x17: \c _CMP_ORD_S - Ordered (signaling) \n
  1713. ///    0x18: \c _CMP_EQ_US - Equal (unordered, signaling) \n
  1714. ///    0x19: \c _CMP_NGE_UQ - Not-greater-than-or-equal (unordered, non-signaling) \n
  1715. ///    0x1A: \c _CMP_NGT_UQ - Not-greater-than (unordered, non-signaling) \n
  1716. ///    0x1B: \c _CMP_FALSE_OS - False (ordered, signaling) \n
  1717. ///    0x1C: \c _CMP_NEQ_OS - Not-equal (ordered, signaling) \n
  1718. ///    0x1D: \c _CMP_GE_OQ - Greater-than-or-equal (ordered, non-signaling) \n
  1719. ///    0x1E: \c _CMP_GT_OQ - Greater-than (ordered, non-signaling) \n
  1720. ///    0x1F: \c _CMP_TRUE_US - True (unordered, signaling)
  1721. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  1722. #define _mm_cmp_ps(a, b, c) \
  1723.   ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
  1724.                                 (__v4sf)(__m128)(b), (c)))
  1725.  
  1726. /// Compares each of the corresponding double-precision values of two
  1727. ///    256-bit vectors of [4 x double], using the operation specified by the
  1728. ///    immediate integer operand.
  1729. ///
  1730. ///    Returns a [4 x double] vector consisting of four doubles corresponding to
  1731. ///    the four comparison results: zero if the comparison is false, and all 1's
  1732. ///    if the comparison is true.
  1733. ///
  1734. /// \headerfile <x86intrin.h>
  1735. ///
  1736. /// \code
  1737. /// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
  1738. /// \endcode
  1739. ///
  1740. /// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
  1741. ///
  1742. /// \param a
  1743. ///    A 256-bit vector of [4 x double].
  1744. /// \param b
  1745. ///    A 256-bit vector of [4 x double].
  1746. /// \param c
  1747. ///    An immediate integer operand, with bits [4:0] specifying which comparison
  1748. ///    operation to use: \n
  1749. ///    0x00: \c _CMP_EQ_OQ - Equal (ordered, non-signaling) \n
  1750. ///    0x01: \c _CMP_LT_OS - Less-than (ordered, signaling) \n
  1751. ///    0x02: \c _CMP_LE_OS - Less-than-or-equal (ordered, signaling) \n
  1752. ///    0x03: \c _CMP_UNORD_Q - Unordered (non-signaling) \n
  1753. ///    0x04: \c _CMP_NEQ_UQ - Not-equal (unordered, non-signaling) \n
  1754. ///    0x05: \c _CMP_NLT_US - Not-less-than (unordered, signaling) \n
  1755. ///    0x06: \c _CMP_NLE_US - Not-less-than-or-equal (unordered, signaling) \n
  1756. ///    0x07: \c _CMP_ORD_Q - Ordered (non-signaling) \n
  1757. ///    0x08: \c _CMP_EQ_UQ - Equal (unordered, non-signaling) \n
  1758. ///    0x09: \c _CMP_NGE_US - Not-greater-than-or-equal (unordered, signaling) \n
  1759. ///    0x0A: \c _CMP_NGT_US - Not-greater-than (unordered, signaling) \n
  1760. ///    0x0B: \c _CMP_FALSE_OQ - False (ordered, non-signaling) \n
  1761. ///    0x0C: \c _CMP_NEQ_OQ - Not-equal (ordered, non-signaling) \n
  1762. ///    0x0D: \c _CMP_GE_OS - Greater-than-or-equal (ordered, signaling) \n
  1763. ///    0x0E: \c _CMP_GT_OS - Greater-than (ordered, signaling) \n
  1764. ///    0x0F: \c _CMP_TRUE_UQ - True (unordered, non-signaling) \n
  1765. ///    0x10: \c _CMP_EQ_OS - Equal (ordered, signaling) \n
  1766. ///    0x11: \c _CMP_LT_OQ - Less-than (ordered, non-signaling) \n
  1767. ///    0x12: \c _CMP_LE_OQ - Less-than-or-equal (ordered, non-signaling) \n
  1768. ///    0x13: \c _CMP_UNORD_S - Unordered (signaling) \n
  1769. ///    0x14: \c _CMP_NEQ_US - Not-equal (unordered, signaling) \n
  1770. ///    0x15: \c _CMP_NLT_UQ - Not-less-than (unordered, non-signaling) \n
  1771. ///    0x16: \c _CMP_NLE_UQ - Not-less-than-or-equal (unordered, non-signaling) \n
  1772. ///    0x17: \c _CMP_ORD_S - Ordered (signaling) \n
  1773. ///    0x18: \c _CMP_EQ_US - Equal (unordered, signaling) \n
  1774. ///    0x19: \c _CMP_NGE_UQ - Not-greater-than-or-equal (unordered, non-signaling) \n
  1775. ///    0x1A: \c _CMP_NGT_UQ - Not-greater-than (unordered, non-signaling) \n
  1776. ///    0x1B: \c _CMP_FALSE_OS - False (ordered, signaling) \n
  1777. ///    0x1C: \c _CMP_NEQ_OS - Not-equal (ordered, signaling) \n
  1778. ///    0x1D: \c _CMP_GE_OQ - Greater-than-or-equal (ordered, non-signaling) \n
  1779. ///    0x1E: \c _CMP_GT_OQ - Greater-than (ordered, non-signaling) \n
  1780. ///    0x1F: \c _CMP_TRUE_US - True (unordered, signaling)
  1781. /// \returns A 256-bit vector of [4 x double] containing the comparison results.
  1782. #define _mm256_cmp_pd(a, b, c) \
  1783.   ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
  1784.                                     (__v4df)(__m256d)(b), (c)))
  1785.  
  1786. /// Compares each of the corresponding values of two 256-bit vectors of
  1787. ///    [8 x float], using the operation specified by the immediate integer
  1788. ///    operand.
  1789. ///
  1790. ///    Returns a [8 x float] vector consisting of eight floats corresponding to
  1791. ///    the eight comparison results: zero if the comparison is false, and all
  1792. ///    1's if the comparison is true.
  1793. ///
  1794. /// \headerfile <x86intrin.h>
  1795. ///
  1796. /// \code
  1797. /// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
  1798. /// \endcode
  1799. ///
  1800. /// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
  1801. ///
  1802. /// \param a
  1803. ///    A 256-bit vector of [8 x float].
  1804. /// \param b
  1805. ///    A 256-bit vector of [8 x float].
  1806. /// \param c
  1807. ///    An immediate integer operand, with bits [4:0] specifying which comparison
  1808. ///    operation to use: \n
  1809. ///    0x00: \c _CMP_EQ_OQ - Equal (ordered, non-signaling) \n
  1810. ///    0x01: \c _CMP_LT_OS - Less-than (ordered, signaling) \n
  1811. ///    0x02: \c _CMP_LE_OS - Less-than-or-equal (ordered, signaling) \n
  1812. ///    0x03: \c _CMP_UNORD_Q - Unordered (non-signaling) \n
  1813. ///    0x04: \c _CMP_NEQ_UQ - Not-equal (unordered, non-signaling) \n
  1814. ///    0x05: \c _CMP_NLT_US - Not-less-than (unordered, signaling) \n
  1815. ///    0x06: \c _CMP_NLE_US - Not-less-than-or-equal (unordered, signaling) \n
  1816. ///    0x07: \c _CMP_ORD_Q - Ordered (non-signaling) \n
  1817. ///    0x08: \c _CMP_EQ_UQ - Equal (unordered, non-signaling) \n
  1818. ///    0x09: \c _CMP_NGE_US - Not-greater-than-or-equal (unordered, signaling) \n
  1819. ///    0x0A: \c _CMP_NGT_US - Not-greater-than (unordered, signaling) \n
  1820. ///    0x0B: \c _CMP_FALSE_OQ - False (ordered, non-signaling) \n
  1821. ///    0x0C: \c _CMP_NEQ_OQ - Not-equal (ordered, non-signaling) \n
  1822. ///    0x0D: \c _CMP_GE_OS - Greater-than-or-equal (ordered, signaling) \n
  1823. ///    0x0E: \c _CMP_GT_OS - Greater-than (ordered, signaling) \n
  1824. ///    0x0F: \c _CMP_TRUE_UQ - True (unordered, non-signaling) \n
  1825. ///    0x10: \c _CMP_EQ_OS - Equal (ordered, signaling) \n
  1826. ///    0x11: \c _CMP_LT_OQ - Less-than (ordered, non-signaling) \n
  1827. ///    0x12: \c _CMP_LE_OQ - Less-than-or-equal (ordered, non-signaling) \n
  1828. ///    0x13: \c _CMP_UNORD_S - Unordered (signaling) \n
  1829. ///    0x14: \c _CMP_NEQ_US - Not-equal (unordered, signaling) \n
  1830. ///    0x15: \c _CMP_NLT_UQ - Not-less-than (unordered, non-signaling) \n
  1831. ///    0x16: \c _CMP_NLE_UQ - Not-less-than-or-equal (unordered, non-signaling) \n
  1832. ///    0x17: \c _CMP_ORD_S - Ordered (signaling) \n
  1833. ///    0x18: \c _CMP_EQ_US - Equal (unordered, signaling) \n
  1834. ///    0x19: \c _CMP_NGE_UQ - Not-greater-than-or-equal (unordered, non-signaling) \n
  1835. ///    0x1A: \c _CMP_NGT_UQ - Not-greater-than (unordered, non-signaling) \n
  1836. ///    0x1B: \c _CMP_FALSE_OS - False (ordered, signaling) \n
  1837. ///    0x1C: \c _CMP_NEQ_OS - Not-equal (ordered, signaling) \n
  1838. ///    0x1D: \c _CMP_GE_OQ - Greater-than-or-equal (ordered, non-signaling) \n
  1839. ///    0x1E: \c _CMP_GT_OQ - Greater-than (ordered, non-signaling) \n
  1840. ///    0x1F: \c _CMP_TRUE_US - True (unordered, signaling)
  1841. /// \returns A 256-bit vector of [8 x float] containing the comparison results.
  1842. #define _mm256_cmp_ps(a, b, c) \
  1843.   ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
  1844.                                    (__v8sf)(__m256)(b), (c)))
  1845.  
  1846. /// Compares each of the corresponding scalar double-precision values of
  1847. ///    two 128-bit vectors of [2 x double], using the operation specified by the
  1848. ///    immediate integer operand.
  1849. ///
  1850. ///    If the result is true, all 64 bits of the destination vector are set;
  1851. ///    otherwise they are cleared.
  1852. ///
  1853. /// \headerfile <x86intrin.h>
  1854. ///
  1855. /// \code
  1856. /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
  1857. /// \endcode
  1858. ///
  1859. /// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
  1860. ///
  1861. /// \param a
  1862. ///    A 128-bit vector of [2 x double].
  1863. /// \param b
  1864. ///    A 128-bit vector of [2 x double].
  1865. /// \param c
  1866. ///    An immediate integer operand, with bits [4:0] specifying which comparison
  1867. ///    operation to use: \n
  1868. ///    0x00: \c _CMP_EQ_OQ - Equal (ordered, non-signaling) \n
  1869. ///    0x01: \c _CMP_LT_OS - Less-than (ordered, signaling) \n
  1870. ///    0x02: \c _CMP_LE_OS - Less-than-or-equal (ordered, signaling) \n
  1871. ///    0x03: \c _CMP_UNORD_Q - Unordered (non-signaling) \n
  1872. ///    0x04: \c _CMP_NEQ_UQ - Not-equal (unordered, non-signaling) \n
  1873. ///    0x05: \c _CMP_NLT_US - Not-less-than (unordered, signaling) \n
  1874. ///    0x06: \c _CMP_NLE_US - Not-less-than-or-equal (unordered, signaling) \n
  1875. ///    0x07: \c _CMP_ORD_Q - Ordered (non-signaling) \n
  1876. ///    0x08: \c _CMP_EQ_UQ - Equal (unordered, non-signaling) \n
  1877. ///    0x09: \c _CMP_NGE_US - Not-greater-than-or-equal (unordered, signaling) \n
  1878. ///    0x0A: \c _CMP_NGT_US - Not-greater-than (unordered, signaling) \n
  1879. ///    0x0B: \c _CMP_FALSE_OQ - False (ordered, non-signaling) \n
  1880. ///    0x0C: \c _CMP_NEQ_OQ - Not-equal (ordered, non-signaling) \n
  1881. ///    0x0D: \c _CMP_GE_OS - Greater-than-or-equal (ordered, signaling) \n
  1882. ///    0x0E: \c _CMP_GT_OS - Greater-than (ordered, signaling) \n
  1883. ///    0x0F: \c _CMP_TRUE_UQ - True (unordered, non-signaling) \n
  1884. ///    0x10: \c _CMP_EQ_OS - Equal (ordered, signaling) \n
  1885. ///    0x11: \c _CMP_LT_OQ - Less-than (ordered, non-signaling) \n
  1886. ///    0x12: \c _CMP_LE_OQ - Less-than-or-equal (ordered, non-signaling) \n
  1887. ///    0x13: \c _CMP_UNORD_S - Unordered (signaling) \n
  1888. ///    0x14: \c _CMP_NEQ_US - Not-equal (unordered, signaling) \n
  1889. ///    0x15: \c _CMP_NLT_UQ - Not-less-than (unordered, non-signaling) \n
  1890. ///    0x16: \c _CMP_NLE_UQ - Not-less-than-or-equal (unordered, non-signaling) \n
  1891. ///    0x17: \c _CMP_ORD_S - Ordered (signaling) \n
  1892. ///    0x18: \c _CMP_EQ_US - Equal (unordered, signaling) \n
  1893. ///    0x19: \c _CMP_NGE_UQ - Not-greater-than-or-equal (unordered, non-signaling) \n
  1894. ///    0x1A: \c _CMP_NGT_UQ - Not-greater-than (unordered, non-signaling) \n
  1895. ///    0x1B: \c _CMP_FALSE_OS - False (ordered, signaling) \n
  1896. ///    0x1C: \c _CMP_NEQ_OS - Not-equal (ordered, signaling) \n
  1897. ///    0x1D: \c _CMP_GE_OQ - Greater-than-or-equal (ordered, non-signaling) \n
  1898. ///    0x1E: \c _CMP_GT_OQ - Greater-than (ordered, non-signaling) \n
  1899. ///    0x1F: \c _CMP_TRUE_US - True (unordered, signaling)
  1900. /// \returns A 128-bit vector of [2 x double] containing the comparison results.
  1901. #define _mm_cmp_sd(a, b, c) \
  1902.   ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
  1903.                                  (__v2df)(__m128d)(b), (c)))
  1904.  
  1905. /// Compares each of the corresponding scalar values of two 128-bit
  1906. ///    vectors of [4 x float], using the operation specified by the immediate
  1907. ///    integer operand.
  1908. ///
  1909. ///    If the result is true, all 32 bits of the destination vector are set;
  1910. ///    otherwise they are cleared.
  1911. ///
  1912. /// \headerfile <x86intrin.h>
  1913. ///
  1914. /// \code
  1915. /// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
  1916. /// \endcode
  1917. ///
  1918. /// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
  1919. ///
  1920. /// \param a
  1921. ///    A 128-bit vector of [4 x float].
  1922. /// \param b
  1923. ///    A 128-bit vector of [4 x float].
  1924. /// \param c
  1925. ///    An immediate integer operand, with bits [4:0] specifying which comparison
  1926. ///    operation to use: \n
  1927. ///    0x00: \c _CMP_EQ_OQ - Equal (ordered, non-signaling) \n
  1928. ///    0x01: \c _CMP_LT_OS - Less-than (ordered, signaling) \n
  1929. ///    0x02: \c _CMP_LE_OS - Less-than-or-equal (ordered, signaling) \n
  1930. ///    0x03: \c _CMP_UNORD_Q - Unordered (non-signaling) \n
  1931. ///    0x04: \c _CMP_NEQ_UQ - Not-equal (unordered, non-signaling) \n
  1932. ///    0x05: \c _CMP_NLT_US - Not-less-than (unordered, signaling) \n
  1933. ///    0x06: \c _CMP_NLE_US - Not-less-than-or-equal (unordered, signaling) \n
  1934. ///    0x07: \c _CMP_ORD_Q - Ordered (non-signaling) \n
  1935. ///    0x08: \c _CMP_EQ_UQ - Equal (unordered, non-signaling) \n
  1936. ///    0x09: \c _CMP_NGE_US - Not-greater-than-or-equal (unordered, signaling) \n
  1937. ///    0x0A: \c _CMP_NGT_US - Not-greater-than (unordered, signaling) \n
  1938. ///    0x0B: \c _CMP_FALSE_OQ - False (ordered, non-signaling) \n
  1939. ///    0x0C: \c _CMP_NEQ_OQ - Not-equal (ordered, non-signaling) \n
  1940. ///    0x0D: \c _CMP_GE_OS - Greater-than-or-equal (ordered, signaling) \n
  1941. ///    0x0E: \c _CMP_GT_OS - Greater-than (ordered, signaling) \n
  1942. ///    0x0F: \c _CMP_TRUE_UQ - True (unordered, non-signaling) \n
  1943. ///    0x10: \c _CMP_EQ_OS - Equal (ordered, signaling) \n
  1944. ///    0x11: \c _CMP_LT_OQ - Less-than (ordered, non-signaling) \n
  1945. ///    0x12: \c _CMP_LE_OQ - Less-than-or-equal (ordered, non-signaling) \n
  1946. ///    0x13: \c _CMP_UNORD_S - Unordered (signaling) \n
  1947. ///    0x14: \c _CMP_NEQ_US - Not-equal (unordered, signaling) \n
  1948. ///    0x15: \c _CMP_NLT_UQ - Not-less-than (unordered, non-signaling) \n
  1949. ///    0x16: \c _CMP_NLE_UQ - Not-less-than-or-equal (unordered, non-signaling) \n
  1950. ///    0x17: \c _CMP_ORD_S - Ordered (signaling) \n
  1951. ///    0x18: \c _CMP_EQ_US - Equal (unordered, signaling) \n
  1952. ///    0x19: \c _CMP_NGE_UQ - Not-greater-than-or-equal (unordered, non-signaling) \n
  1953. ///    0x1A: \c _CMP_NGT_UQ - Not-greater-than (unordered, non-signaling) \n
  1954. ///    0x1B: \c _CMP_FALSE_OS - False (ordered, signaling) \n
  1955. ///    0x1C: \c _CMP_NEQ_OS - Not-equal (ordered, signaling) \n
  1956. ///    0x1D: \c _CMP_GE_OQ - Greater-than-or-equal (ordered, non-signaling) \n
  1957. ///    0x1E: \c _CMP_GT_OQ - Greater-than (ordered, non-signaling) \n
  1958. ///    0x1F: \c _CMP_TRUE_US - True (unordered, signaling)
  1959. /// \returns A 128-bit vector of [4 x float] containing the comparison results.
  1960. #define _mm_cmp_ss(a, b, c) \
  1961.   ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
  1962.                                 (__v4sf)(__m128)(b), (c)))
  1963.  
  1964. /// Takes a [8 x i32] vector and returns the vector element value
  1965. ///    indexed by the immediate constant operand.
  1966. ///
  1967. /// \headerfile <x86intrin.h>
  1968. ///
  1969. /// \code
  1970. /// int _mm256_extract_epi32(__m256i X, const int N);
  1971. /// \endcode
  1972. ///
  1973. /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
  1974. ///   instruction.
  1975. ///
  1976. /// \param X
  1977. ///    A 256-bit integer vector of [8 x i32].
  1978. /// \param N
  1979. ///    An immediate integer operand with bits [2:0] determining which vector
  1980. ///    element is extracted and returned.
  1981. /// \returns A 32-bit integer containing the extracted 32 bits of extended
  1982. ///    packed data.
  1983. #define _mm256_extract_epi32(X, N) \
  1984.   ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
  1985.  
  1986. /// Takes a [16 x i16] vector and returns the vector element value
  1987. ///    indexed by the immediate constant operand.
  1988. ///
  1989. /// \headerfile <x86intrin.h>
  1990. ///
  1991. /// \code
  1992. /// int _mm256_extract_epi16(__m256i X, const int N);
  1993. /// \endcode
  1994. ///
  1995. /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
  1996. ///   instruction.
  1997. ///
  1998. /// \param X
  1999. ///    A 256-bit integer vector of [16 x i16].
  2000. /// \param N
  2001. ///    An immediate integer operand with bits [3:0] determining which vector
  2002. ///    element is extracted and returned.
  2003. /// \returns A 32-bit integer containing the extracted 16-bit element,
  2004. ///    zero-extended to 32 bits.
  2005. #define _mm256_extract_epi16(X, N) \
  2006.   ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
  2007.                                                      (int)(N)))
  2008.  
  2009. /// Takes a [32 x i8] vector and returns the vector element value
  2010. ///    indexed by the immediate constant operand.
  2011. ///
  2012. /// \headerfile <x86intrin.h>
  2013. ///
  2014. /// \code
  2015. /// int _mm256_extract_epi8(__m256i X, const int N);
  2016. /// \endcode
  2017. ///
  2018. /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
  2019. ///   instruction.
  2020. ///
  2021. /// \param X
  2022. ///    A 256-bit integer vector of [32 x i8].
  2023. /// \param N
  2024. ///    An immediate integer operand with bits [4:0] determining which vector
  2025. ///    element is extracted and returned.
  2026. /// \returns A 32-bit integer containing the extracted 8-bit element,
  2027. ///    zero-extended to 32 bits.
  2028. #define _mm256_extract_epi8(X, N) \
  2029.   ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
  2030.                                                     (int)(N)))
  2031.  
  2032. #ifdef __x86_64__
  2033. /// Takes a [4 x i64] vector and returns the vector element value
  2034. ///    indexed by the immediate constant operand.
  2035. ///
  2036. /// \headerfile <x86intrin.h>
  2037. ///
  2038. /// \code
  2039. /// long long _mm256_extract_epi64(__m256i X, const int N);
  2040. /// \endcode
  2041. ///
  2042. /// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
  2043. ///   instruction.
  2044. ///
  2045. /// \param X
  2046. ///    A 256-bit integer vector of [4 x i64].
  2047. /// \param N
  2048. ///    An immediate integer operand with bits [1:0] determining which vector
  2049. ///    element is extracted and returned.
  2050. /// \returns A 64-bit integer containing the value of the extracted 64-bit
  2051. ///    element.
  2052. #define _mm256_extract_epi64(X, N) \
  2053.   ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
  2054. #endif
  2055.  
  2056. /// Takes an [8 x i32] vector and replaces the vector element value
  2057. ///    indexed by the immediate constant operand with a new value. Returns the
  2058. ///    modified vector.
  2059. ///
  2060. /// \headerfile <x86intrin.h>
  2061. ///
  2062. /// \code
  2063. /// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
  2064. /// \endcode
  2065. ///
  2066. /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
  2067. ///   instruction.
  2068. ///
  2069. /// \param X
  2070. ///    A 256-bit vector of [8 x i32] to be used by the insert operation.
  2071. /// \param I
  2072. ///    An integer value. The replacement value for the insert operation.
  2073. /// \param N
  2074. ///    An immediate integer specifying the index of the vector element to be
  2075. ///    replaced.
  2076. /// \returns A copy of vector \a X, after replacing its element indexed by
  2077. ///    \a N with \a I.
  2078. #define _mm256_insert_epi32(X, I, N) \
  2079.   ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
  2080.                                         (int)(I), (int)(N)))
  2081.  
  2082.  
  2083. /// Takes a [16 x i16] vector and replaces the vector element value
  2084. ///    indexed by the immediate constant operand with a new value. Returns the
  2085. ///    modified vector.
  2086. ///
  2087. /// \headerfile <x86intrin.h>
  2088. ///
  2089. /// \code
  2090. /// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
  2091. /// \endcode
  2092. ///
  2093. /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
  2094. ///   instruction.
  2095. ///
  2096. /// \param X
  2097. ///    A 256-bit vector of [16 x i16] to be used by the insert operation.
  2098. /// \param I
  2099. ///    An i16 integer value. The replacement value for the insert operation.
  2100. /// \param N
  2101. ///    An immediate integer specifying the index of the vector element to be
  2102. ///    replaced.
  2103. /// \returns A copy of vector \a X, after replacing its element indexed by
  2104. ///    \a N with \a I.
  2105. #define _mm256_insert_epi16(X, I, N) \
  2106.   ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
  2107.                                          (int)(I), (int)(N)))
  2108.  
  2109. /// Takes a [32 x i8] vector and replaces the vector element value
  2110. ///    indexed by the immediate constant operand with a new value. Returns the
  2111. ///    modified vector.
  2112. ///
  2113. /// \headerfile <x86intrin.h>
  2114. ///
  2115. /// \code
  2116. /// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
  2117. /// \endcode
  2118. ///
  2119. /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
  2120. ///   instruction.
  2121. ///
  2122. /// \param X
  2123. ///    A 256-bit vector of [32 x i8] to be used by the insert operation.
  2124. /// \param I
  2125. ///    An i8 integer value. The replacement value for the insert operation.
  2126. /// \param N
  2127. ///    An immediate integer specifying the index of the vector element to be
  2128. ///    replaced.
  2129. /// \returns A copy of vector \a X, after replacing its element indexed by
  2130. ///    \a N with \a I.
  2131. #define _mm256_insert_epi8(X, I, N) \
  2132.   ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
  2133.                                          (int)(I), (int)(N)))
  2134.  
  2135. #ifdef __x86_64__
  2136. /// Takes a [4 x i64] vector and replaces the vector element value
  2137. ///    indexed by the immediate constant operand with a new value. Returns the
  2138. ///    modified vector.
  2139. ///
  2140. /// \headerfile <x86intrin.h>
  2141. ///
  2142. /// \code
  2143. /// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
  2144. /// \endcode
  2145. ///
  2146. /// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
  2147. ///   instruction.
  2148. ///
  2149. /// \param X
  2150. ///    A 256-bit vector of [4 x i64] to be used by the insert operation.
  2151. /// \param I
  2152. ///    A 64-bit integer value. The replacement value for the insert operation.
  2153. /// \param N
  2154. ///    An immediate integer specifying the index of the vector element to be
  2155. ///    replaced.
  2156. /// \returns A copy of vector \a X, after replacing its element indexed by
  2157. ///    \a N with \a I.
  2158. #define _mm256_insert_epi64(X, I, N) \
  2159.   ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
  2160.                                         (long long)(I), (int)(N)))
  2161. #endif
  2162.  
  2163. /* Conversion */
  2164. /// Converts a 128-bit [4 x i32] vector into a 256-bit [4 x double] vector.
  2165. ///
  2166. /// \headerfile <x86intrin.h>
  2167. ///
  2168. /// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
  2169. ///
  2170. /// \param __a
  2171. ///    A 128-bit integer vector of [4 x i32].
  2172. /// \returns A 256-bit vector of [4 x double] containing the converted values.
  2173. static __inline __m256d __DEFAULT_FN_ATTRS
  2174. _mm256_cvtepi32_pd(__m128i __a)
  2175. {
  2176.   return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
  2177. }
  2178.  
  2179. /// Converts a vector of [8 x i32] into a vector of [8 x float].
  2180. ///
  2181. /// \headerfile <x86intrin.h>
  2182. ///
  2183. /// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
  2184. ///
  2185. /// \param __a
  2186. ///    A 256-bit integer vector of [8 x i32].
  2187. /// \returns A 256-bit vector of [8 x float] containing the converted values.
  2188. static __inline __m256 __DEFAULT_FN_ATTRS
  2189. _mm256_cvtepi32_ps(__m256i __a)
  2190. {
  2191.   return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
  2192. }
  2193.  
  2194. /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
  2195. ///    [4 x float], narrowing each element to single precision.
  2196. ///
  2197. /// \headerfile <x86intrin.h>
  2198. ///
  2199. /// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
  2200. ///
  2201. /// \param __a
  2202. ///    A 256-bit vector of [4 x double].
  2203. /// \returns A 128-bit vector of [4 x float] containing the converted values.
  2204. static __inline __m128 __DEFAULT_FN_ATTRS
  2205. _mm256_cvtpd_ps(__m256d __a)
  2206. {
  2207.   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
  2208. }
  2209.  
  2210. /// Converts a vector of [8 x float] into a vector of [8 x i32].
  2211. ///
  2212. /// \headerfile <x86intrin.h>
  2213. ///
  2214. /// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
  2215. ///
  2216. /// \param __a
  2217. ///    A 256-bit vector of [8 x float].
  2218. /// \returns A 256-bit vector of [8 x i32] containing the converted values.
  2219. static __inline __m256i __DEFAULT_FN_ATTRS
  2220. _mm256_cvtps_epi32(__m256 __a)
  2221. {
  2222.   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
  2223. }
  2224.  
  2225. /// Converts a 128-bit vector of [4 x float] into a 256-bit vector of
  2226. ///    [4 x double].
  2227. ///
  2228. /// \headerfile <x86intrin.h>
  2229. ///
  2230. /// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
  2231. ///
  2232. /// \param __a
  2233. ///    A 128-bit vector of [4 x float].
  2234. /// \returns A 256-bit vector of [4 x double] containing the converted values.
  2235. static __inline __m256d __DEFAULT_FN_ATTRS
  2236. _mm256_cvtps_pd(__m128 __a)
  2237. {
  2238.   return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
  2239. }
  2240.  
  2241. /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
  2242. ///    [4 x i32], truncating the result by rounding towards zero when it is
  2243. ///    inexact.
  2244. ///
  2245. /// \headerfile <x86intrin.h>
  2246. ///
  2247. /// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
  2248. ///
  2249. /// \param __a
  2250. ///    A 256-bit vector of [4 x double].
  2251. /// \returns A 128-bit vector of [4 x i32] containing the converted values.
  2252. static __inline __m128i __DEFAULT_FN_ATTRS
  2253. _mm256_cvttpd_epi32(__m256d __a)
  2254. {
  2255.   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
  2256. }
  2257.  
  2258. /// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
  2259. ///    [4 x i32]. When a conversion is inexact, the value returned is rounded
  2260. ///    according to the rounding control bits in the MXCSR register.
  2261. ///
  2262. /// \headerfile <x86intrin.h>
  2263. ///
  2264. /// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
  2265. ///
  2266. /// \param __a
  2267. ///    A 256-bit vector of [4 x double].
  2268. /// \returns A 128-bit vector of [4 x i32] containing the converted values.
  2269. static __inline __m128i __DEFAULT_FN_ATTRS
  2270. _mm256_cvtpd_epi32(__m256d __a)
  2271. {
  2272.   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
  2273. }
  2274.  
  2275. /// Converts a vector of [8 x float] into a vector of [8 x i32],
  2276. ///    truncating the result by rounding towards zero when it is inexact.
  2277. ///
  2278. /// \headerfile <x86intrin.h>
  2279. ///
  2280. /// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
  2281. ///
  2282. /// \param __a
  2283. ///    A 256-bit vector of [8 x float].
  2284. /// \returns A 256-bit vector of [8 x i32] containing the converted values.
  2285. static __inline __m256i __DEFAULT_FN_ATTRS
  2286. _mm256_cvttps_epi32(__m256 __a)
  2287. {
  2288.   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
  2289. }
  2290.  
  2291. /// Returns the first element of the input vector of [4 x double].
  2292. ///
  2293. /// \headerfile <x86intrin.h>
  2294. ///
  2295. /// This intrinsic is a utility function and does not correspond to a specific
  2296. ///    instruction.
  2297. ///
  2298. /// \param __a
  2299. ///    A 256-bit vector of [4 x double].
  2300. /// \returns A 64-bit double containing the first element of the input vector.
  2301. static __inline double __DEFAULT_FN_ATTRS
  2302. _mm256_cvtsd_f64(__m256d __a)
  2303. {
  2304.  return __a[0];
  2305. }
  2306.  
  2307. /// Returns the first element of the input vector of [8 x i32].
  2308. ///
  2309. /// \headerfile <x86intrin.h>
  2310. ///
  2311. /// This intrinsic is a utility function and does not correspond to a specific
  2312. ///    instruction.
  2313. ///
  2314. /// \param __a
  2315. ///    A 256-bit vector of [8 x i32].
  2316. /// \returns A 32-bit integer containing the first element of the input vector.
  2317. static __inline int __DEFAULT_FN_ATTRS
  2318. _mm256_cvtsi256_si32(__m256i __a)
  2319. {
  2320.  __v8si __b = (__v8si)__a;
  2321.  return __b[0];
  2322. }
  2323.  
  2324. /// Returns the first element of the input vector of [8 x float].
  2325. ///
  2326. /// \headerfile <x86intrin.h>
  2327. ///
  2328. /// This intrinsic is a utility function and does not correspond to a specific
  2329. ///    instruction.
  2330. ///
  2331. /// \param __a
  2332. ///    A 256-bit vector of [8 x float].
  2333. /// \returns A 32-bit float containing the first element of the input vector.
  2334. static __inline float __DEFAULT_FN_ATTRS
  2335. _mm256_cvtss_f32(__m256 __a)
  2336. {
  2337.  return __a[0];
  2338. }
  2339.  
  2340. /* Vector replicate */
  2341. /// Copies each odd-indexed element of a 256-bit vector of [8 x float] to
  2342. ///    both its own position and the even-indexed position just below it.
  2343. ///
  2344. /// \headerfile <x86intrin.h>
  2345. ///
  2346. /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
  2347. ///
  2348. /// \param __a
  2349. ///    A 256-bit vector of [8 x float]. \n
  2350. ///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
  2351. ///    the return value. \n
  2352. ///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
  2353. ///    the return value. \n
  2354. ///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
  2355. ///    return value. \n
  2356. ///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
  2357. ///    return value.
  2358. /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
  2359. ///    values.
  2360. static __inline __m256 __DEFAULT_FN_ATTRS
  2361. _mm256_movehdup_ps(__m256 __a)
  2362. {
  2363.   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
  2364. }
  2365.  
  2366. /// Copies each even-indexed element of a 256-bit vector of [8 x float] to
  2367. ///    both its own position and the odd-indexed position just above it.
  2368. ///
  2369. /// \headerfile <x86intrin.h>
  2370. ///
  2371. /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
  2372. ///
  2373. /// \param __a
  2374. ///    A 256-bit vector of [8 x float]. \n
  2375. ///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
  2376. ///    the return value. \n
  2377. ///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
  2378. ///    the return value. \n
  2379. ///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
  2380. ///    return value. \n
  2381. ///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
  2382. ///    return value.
  2383. /// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
  2384. ///    values.
  2385. static __inline __m256 __DEFAULT_FN_ATTRS
  2386. _mm256_moveldup_ps(__m256 __a)
  2387. {
  2388.   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
  2389. }
  2390.  
  2391. /// Copies each even-indexed double-precision element of a 256-bit vector
  2392. ///    of [4 x double] to both its own position and the odd-indexed position
  2393. ///    just above it.
  2394. ///
  2395. /// \headerfile <x86intrin.h>
  2396. ///
  2397. /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
  2398. ///
  2399. /// \param __a
  2400. ///    A 256-bit vector of [4 x double]. \n
  2401. ///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
  2402. ///    return value. \n
  2403. ///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
  2404. ///    the return value.
  2405. /// \returns A 256-bit vector of [4 x double] containing the moved and
  2406. ///    duplicated values.
  2407. static __inline __m256d __DEFAULT_FN_ATTRS
  2408. _mm256_movedup_pd(__m256d __a)
  2409. {
  2410.   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
  2411. }
  2412.  
  2413. /* Unpack and Interleave */
  2414. /// Unpacks the odd-indexed vector elements from two 256-bit vectors of
  2415. ///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
  2416. ///
  2417. /// \headerfile <x86intrin.h>
  2418. ///
  2419. /// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
  2420. ///
  2421. /// \param __a
  2422. ///    A 256-bit floating-point vector of [4 x double]. \n
  2423. ///    Bits [127:64] are written to bits [63:0] of the return value. \n
  2424. ///    Bits [255:192] are written to bits [191:128] of the return value.
  2425. /// \param __b
  2426. ///    A 256-bit floating-point vector of [4 x double]. \n
  2427. ///    Bits [127:64] are written to bits [127:64] of the return value. \n
  2428. ///    Bits [255:192] are written to bits [255:192] of the return value.
  2429. /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
  2430. static __inline __m256d __DEFAULT_FN_ATTRS
  2431. _mm256_unpackhi_pd(__m256d __a, __m256d __b)
  2432. {
  2433.   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
  2434. }
  2435.  
  2436. /// Unpacks the even-indexed vector elements from two 256-bit vectors of
  2437. ///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
  2438. ///
  2439. /// \headerfile <x86intrin.h>
  2440. ///
  2441. /// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
  2442. ///
  2443. /// \param __a
  2444. ///    A 256-bit floating-point vector of [4 x double]. \n
  2445. ///    Bits [63:0] are written to bits [63:0] of the return value. \n
  2446. ///    Bits [191:128] are written to bits [191:128] of the return value.
  2447. /// \param __b
  2448. ///    A 256-bit floating-point vector of [4 x double]. \n
  2449. ///    Bits [63:0] are written to bits [127:64] of the return value. \n
  2450. ///    Bits [191:128] are written to bits [255:192] of the return value.
  2451. /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
  2452. static __inline __m256d __DEFAULT_FN_ATTRS
  2453. _mm256_unpacklo_pd(__m256d __a, __m256d __b)
  2454. {
  2455.   return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
  2456. }
  2457.  
  2458. /// Unpacks the 32-bit vector elements 2, 3, 6 and 7 (the upper half of each
  2459. ///    128-bit lane) from each of the two 256-bit vectors of [8 x float] and
  2460. ///    interleaves them into a 256-bit vector of [8 x float].
  2461. ///
  2462. /// \headerfile <x86intrin.h>
  2463. ///
  2464. /// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
  2465. ///
  2466. /// \param __a
  2467. ///    A 256-bit vector of [8 x float]. \n
  2468. ///    Bits [95:64] are written to bits [31:0] of the return value. \n
  2469. ///    Bits [127:96] are written to bits [95:64] of the return value. \n
  2470. ///    Bits [223:192] are written to bits [159:128] of the return value. \n
  2471. ///    Bits [255:224] are written to bits [223:192] of the return value.
  2472. /// \param __b
  2473. ///    A 256-bit vector of [8 x float]. \n
  2474. ///    Bits [95:64] are written to bits [63:32] of the return value. \n
  2475. ///    Bits [127:96] are written to bits [127:96] of the return value. \n
  2476. ///    Bits [223:192] are written to bits [191:160] of the return value. \n
  2477. ///    Bits [255:224] are written to bits [255:224] of the return value.
  2478. /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
  2479. static __inline __m256 __DEFAULT_FN_ATTRS
  2480. _mm256_unpackhi_ps(__m256 __a, __m256 __b)
  2481. {
  2482.   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
  2483. }
  2484.  
  2485. /// Unpacks the 32-bit vector elements 0, 1, 4 and 5 (the lower half of each
  2486. ///    128-bit lane) from each of the two 256-bit vectors of [8 x float] and
  2487. ///    interleaves them into a 256-bit vector of [8 x float].
  2488. ///
  2489. /// \headerfile <x86intrin.h>
  2490. ///
  2491. /// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
  2492. ///
  2493. /// \param __a
  2494. ///    A 256-bit vector of [8 x float]. \n
  2495. ///    Bits [31:0] are written to bits [31:0] of the return value. \n
  2496. ///    Bits [63:32] are written to bits [95:64] of the return value. \n
  2497. ///    Bits [159:128] are written to bits [159:128] of the return value. \n
  2498. ///    Bits [191:160] are written to bits [223:192] of the return value.
  2499. /// \param __b
  2500. ///    A 256-bit vector of [8 x float]. \n
  2501. ///    Bits [31:0] are written to bits [63:32] of the return value. \n
  2502. ///    Bits [63:32] are written to bits [127:96] of the return value. \n
  2503. ///    Bits [159:128] are written to bits [191:160] of the return value. \n
  2504. ///    Bits [191:160] are written to bits [255:224] of the return value.
  2505. /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
  2506. static __inline __m256 __DEFAULT_FN_ATTRS
  2507. _mm256_unpacklo_ps(__m256 __a, __m256 __b)
  2508. {
  2509.   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
  2510. }
  2511.  
  2512. /* Bit Test */
  2513. /// Given two 128-bit floating-point vectors of [2 x double], performs an
  2514. ///    element-by-element comparison of the double-precision elements in the
  2515. ///    first source vector and the corresponding elements in the second source
  2516. ///    vector.
  2517. ///
  2518. ///    The EFLAGS register is updated as follows: \n
  2519. ///    If there is at least one pair of double-precision elements where the
  2520. ///    sign bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2521. ///    ZF flag is set to 1. \n
  2522. ///    If there is at least one pair of double-precision elements where the
  2523. ///    sign bit of the first element is 0 and the sign bit of the second element
  2524. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2525. ///    This intrinsic returns the value of the ZF flag.
  2526. ///
  2527. /// \headerfile <x86intrin.h>
  2528. ///
  2529. /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
  2530. ///
  2531. /// \param __a
  2532. ///    A 128-bit vector of [2 x double].
  2533. /// \param __b
  2534. ///    A 128-bit vector of [2 x double].
  2535. /// \returns The ZF flag in the EFLAGS register.
  2536. static __inline int __DEFAULT_FN_ATTRS128
  2537. _mm_testz_pd(__m128d __a, __m128d __b)
  2538. {
  2539.   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
  2540. }
  2541.  
  2542. /// Given two 128-bit floating-point vectors of [2 x double], performs an
  2543. ///    element-by-element comparison of the double-precision elements in the
  2544. ///    first source vector and the corresponding elements in the second source
  2545. ///    vector.
  2546. ///
  2547. ///    The EFLAGS register is updated as follows: \n
  2548. ///    If there is at least one pair of double-precision elements where the
  2549. ///    sign bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2550. ///    ZF flag is set to 1. \n
  2551. ///    If there is at least one pair of double-precision elements where the
  2552. ///    sign bit of the first element is 0 and the sign bit of the second element
  2553. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2554. ///    This intrinsic returns the value of the CF flag.
  2555. ///
  2556. /// \headerfile <x86intrin.h>
  2557. ///
  2558. /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
  2559. ///
  2560. /// \param __a
  2561. ///    A 128-bit vector of [2 x double].
  2562. /// \param __b
  2563. ///    A 128-bit vector of [2 x double].
  2564. /// \returns The CF flag in the EFLAGS register.
  2565. static __inline int __DEFAULT_FN_ATTRS128
  2566. _mm_testc_pd(__m128d __a, __m128d __b)
  2567. {
  2568.   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
  2569. }
  2570.  
  2571. /// Given two 128-bit floating-point vectors of [2 x double], performs an
  2572. ///    element-by-element comparison of the double-precision elements in the
  2573. ///    first source vector and the corresponding elements in the second source
  2574. ///    vector.
  2575. ///
  2576. ///    The EFLAGS register is updated as follows: \n
  2577. ///    If there is at least one pair of double-precision elements where the
  2578. ///    sign bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2579. ///    ZF flag is set to 1. \n
  2580. ///    If there is at least one pair of double-precision elements where the
  2581. ///    sign bit of the first element is 0 and the sign bit of the second element
  2582. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2583. ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
  2584. ///    otherwise it returns 0.
  2585. ///
  2586. /// \headerfile <x86intrin.h>
  2587. ///
  2588. /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
  2589. ///
  2590. /// \param __a
  2591. ///    A 128-bit vector of [2 x double].
  2592. /// \param __b
  2593. ///    A 128-bit vector of [2 x double].
  2594. /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
  2595. static __inline int __DEFAULT_FN_ATTRS128
  2596. _mm_testnzc_pd(__m128d __a, __m128d __b)
  2597. {
  2598.   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
  2599. }
  2600.  
  2601. /// Given two 128-bit floating-point vectors of [4 x float], performs an
  2602. ///    element-by-element comparison of the single-precision elements in the
  2603. ///    first source vector and the corresponding elements in the second source
  2604. ///    vector.
  2605. ///
  2606. ///    The EFLAGS register is updated as follows: \n
  2607. ///    If there is at least one pair of single-precision elements where the
  2608. ///    sign bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2609. ///    ZF flag is set to 1. \n
  2610. ///    If there is at least one pair of single-precision elements where the
  2611. ///    sign bit of the first element is 0 and the sign bit of the second element
  2612. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2613. ///    This intrinsic returns the value of the ZF flag.
  2614. ///
  2615. /// \headerfile <x86intrin.h>
  2616. ///
  2617. /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
  2618. ///
  2619. /// \param __a
  2620. ///    A 128-bit vector of [4 x float].
  2621. /// \param __b
  2622. ///    A 128-bit vector of [4 x float].
  2623. /// \returns The ZF flag in the EFLAGS register.
  2624. static __inline int __DEFAULT_FN_ATTRS128
  2625. _mm_testz_ps(__m128 __a, __m128 __b)
  2626. {
  2627.   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
  2628. }
  2629.  
  2630. /// Given two 128-bit floating-point vectors of [4 x float], performs an
  2631. ///    element-by-element comparison of the single-precision elements in the
  2632. ///    first source vector and the corresponding elements in the second source
  2633. ///    vector.
  2634. ///
  2635. ///    The EFLAGS register is updated as follows: \n
  2636. ///    If there is at least one pair of single-precision elements where the
  2637. ///    sign bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2638. ///    ZF flag is set to 1. \n
  2639. ///    If there is at least one pair of single-precision elements where the
  2640. ///    sign bit of the first element is 0 and the sign bit of the second element
  2641. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2642. ///    This intrinsic returns the value of the CF flag.
  2643. ///
  2644. /// \headerfile <x86intrin.h>
  2645. ///
  2646. /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
  2647. ///
  2648. /// \param __a
  2649. ///    A 128-bit vector of [4 x float].
  2650. /// \param __b
  2651. ///    A 128-bit vector of [4 x float].
  2652. /// \returns The CF flag in the EFLAGS register.
  2653. static __inline int __DEFAULT_FN_ATTRS128
  2654. _mm_testc_ps(__m128 __a, __m128 __b)
  2655. {
  2656.   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
  2657. }
  2658.  
  2659. /// Given two 128-bit floating-point vectors of [4 x float], performs an
  2660. ///    element-by-element comparison of the single-precision elements in the
  2661. ///    first source vector and the corresponding elements in the second source
  2662. ///    vector.
  2663. ///
  2664. ///    The EFLAGS register is updated as follows: \n
  2665. ///    If there is at least one pair of single-precision elements where the
  2666. ///    sign bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2667. ///    ZF flag is set to 1. \n
  2668. ///    If there is at least one pair of single-precision elements where the
  2669. ///    sign bit of the first element is 0 and the sign bit of the second element
  2670. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2671. ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
  2672. ///    otherwise it returns 0.
  2673. ///
  2674. /// \headerfile <x86intrin.h>
  2675. ///
  2676. /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
  2677. ///
  2678. /// \param __a
  2679. ///    A 128-bit vector of [4 x float].
  2680. /// \param __b
  2681. ///    A 128-bit vector of [4 x float].
  2682. /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
  2683. static __inline int __DEFAULT_FN_ATTRS128
  2684. _mm_testnzc_ps(__m128 __a, __m128 __b)
  2685. {
  2686.   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
  2687. }
  2688.  
  2689. /// Given two 256-bit floating-point vectors of [4 x double], perform an
  2690. ///    element-by-element comparison of the double-precision elements in the
  2691. ///    first source vector and the corresponding elements in the second source
  2692. ///    vector.
  2693. ///
  2694. ///    The EFLAGS register is updated as follows: \n
  2695. ///    If there is at least one pair of double-precision elements where the
  2696. ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2697. ///    ZF flag is set to 1. \n
  2698. ///    If there is at least one pair of double-precision elements where the
  2699. ///    sign-bit of the first element is 0 and the sign-bit of the second element
  2700. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2701. ///    This intrinsic returns the value of the ZF flag.
  2702. ///
  2703. /// \headerfile <x86intrin.h>
  2704. ///
  2705. /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
  2706. ///
  2707. /// \param __a
  2708. ///    A 256-bit vector of [4 x double].
  2709. /// \param __b
  2710. ///    A 256-bit vector of [4 x double].
  2711. /// \returns the ZF flag.
  2712. static __inline int __DEFAULT_FN_ATTRS
  2713. _mm256_testz_pd(__m256d __a, __m256d __b)
  2714. {
  2715.   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
  2716. }
  2717.  
  2718. /// Given two 256-bit floating-point vectors of [4 x double], perform an
  2719. ///    element-by-element comparison of the double-precision elements in the
  2720. ///    first source vector and the corresponding elements in the second source
  2721. ///    vector.
  2722. ///
  2723. ///    The EFLAGS register is updated as follows: \n
  2724. ///    If there is at least one pair of double-precision elements where the
  2725. ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2726. ///    ZF flag is set to 1. \n
  2727. ///    If there is at least one pair of double-precision elements where the
  2728. ///    sign-bit of the first element is 0 and the sign-bit of the second element
  2729. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2730. ///    This intrinsic returns the value of the CF flag.
  2731. ///
  2732. /// \headerfile <x86intrin.h>
  2733. ///
  2734. /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
  2735. ///
  2736. /// \param __a
  2737. ///    A 256-bit vector of [4 x double].
  2738. /// \param __b
  2739. ///    A 256-bit vector of [4 x double].
  2740. /// \returns the CF flag.
  2741. static __inline int __DEFAULT_FN_ATTRS
  2742. _mm256_testc_pd(__m256d __a, __m256d __b)
  2743. {
  2744.   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
  2745. }
  2746.  
  2747. /// Given two 256-bit floating-point vectors of [4 x double], perform an
  2748. ///    element-by-element comparison of the double-precision elements in the
  2749. ///    first source vector and the corresponding elements in the second source
  2750. ///    vector.
  2751. ///
  2752. ///    The EFLAGS register is updated as follows: \n
  2753. ///    If there is at least one pair of double-precision elements where the
  2754. ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2755. ///    ZF flag is set to 1. \n
  2756. ///    If there is at least one pair of double-precision elements where the
  2757. ///    sign-bit of the first element is 0 and the sign-bit of the second element
  2758. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2759. ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
  2760. ///    otherwise it returns 0.
  2761. ///
  2762. /// \headerfile <x86intrin.h>
  2763. ///
  2764. /// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
  2765. ///
  2766. /// \param __a
  2767. ///    A 256-bit vector of [4 x double].
  2768. /// \param __b
  2769. ///    A 256-bit vector of [4 x double].
  2770. /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
  2771. static __inline int __DEFAULT_FN_ATTRS
  2772. _mm256_testnzc_pd(__m256d __a, __m256d __b)
  2773. {
  2774.   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
  2775. }
  2776.  
  2777. /// Given two 256-bit floating-point vectors of [8 x float], perform an
  2778. ///    element-by-element comparison of the single-precision element in the
  2779. ///    first source vector and the corresponding element in the second source
  2780. ///    vector.
  2781. ///
  2782. ///    The EFLAGS register is updated as follows: \n
  2783. ///    If there is at least one pair of single-precision elements where the
  2784. ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2785. ///    ZF flag is set to 1. \n
  2786. ///    If there is at least one pair of single-precision elements where the
  2787. ///    sign-bit of the first element is 0 and the sign-bit of the second element
  2788. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2789. ///    This intrinsic returns the value of the ZF flag.
  2790. ///
  2791. /// \headerfile <x86intrin.h>
  2792. ///
  2793. /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
  2794. ///
  2795. /// \param __a
  2796. ///    A 256-bit vector of [8 x float].
  2797. /// \param __b
  2798. ///    A 256-bit vector of [8 x float].
  2799. /// \returns the ZF flag.
  2800. static __inline int __DEFAULT_FN_ATTRS
  2801. _mm256_testz_ps(__m256 __a, __m256 __b)
  2802. {
  2803.   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
  2804. }
  2805.  
  2806. /// Given two 256-bit floating-point vectors of [8 x float], perform an
  2807. ///    element-by-element comparison of the single-precision element in the
  2808. ///    first source vector and the corresponding element in the second source
  2809. ///    vector.
  2810. ///
  2811. ///    The EFLAGS register is updated as follows: \n
  2812. ///    If there is at least one pair of single-precision elements where the
  2813. ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2814. ///    ZF flag is set to 1. \n
  2815. ///    If there is at least one pair of single-precision elements where the
  2816. ///    sign-bit of the first element is 0 and the sign-bit of the second element
  2817. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2818. ///    This intrinsic returns the value of the CF flag.
  2819. ///
  2820. /// \headerfile <x86intrin.h>
  2821. ///
  2822. /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
  2823. ///
  2824. /// \param __a
  2825. ///    A 256-bit vector of [8 x float].
  2826. /// \param __b
  2827. ///    A 256-bit vector of [8 x float].
  2828. /// \returns the CF flag.
  2829. static __inline int __DEFAULT_FN_ATTRS
  2830. _mm256_testc_ps(__m256 __a, __m256 __b)
  2831. {
  2832.   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
  2833. }
  2834.  
  2835. /// Given two 256-bit floating-point vectors of [8 x float], perform an
  2836. ///    element-by-element comparison of the single-precision elements in the
  2837. ///    first source vector and the corresponding elements in the second source
  2838. ///    vector.
  2839. ///
  2840. ///    The EFLAGS register is updated as follows: \n
  2841. ///    If there is at least one pair of single-precision elements where the
  2842. ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
  2843. ///    ZF flag is set to 1. \n
  2844. ///    If there is at least one pair of single-precision elements where the
  2845. ///    sign-bit of the first element is 0 and the sign-bit of the second element
  2846. ///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
  2847. ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
  2848. ///    otherwise it returns 0.
  2849. ///
  2850. /// \headerfile <x86intrin.h>
  2851. ///
  2852. /// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
  2853. ///
  2854. /// \param __a
  2855. ///    A 256-bit vector of [8 x float].
  2856. /// \param __b
  2857. ///    A 256-bit vector of [8 x float].
  2858. /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
  2859. static __inline int __DEFAULT_FN_ATTRS
  2860. _mm256_testnzc_ps(__m256 __a, __m256 __b)
  2861. {
  2862.   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
  2863. }
  2864.  
  2865. /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
  2866. ///    of the two source vectors.
  2867. ///
  2868. ///    The EFLAGS register is updated as follows: \n
  2869. ///    If there is at least one pair of bits where both bits are 1, the ZF flag
  2870. ///    is set to 0. Otherwise the ZF flag is set to 1. \n
  2871. ///    If there is at least one pair of bits where the bit from the first source
  2872. ///    vector is 0 and the bit from the second source vector is 1, the CF flag
  2873. ///    is set to 0. Otherwise the CF flag is set to 1. \n
  2874. ///    This intrinsic returns the value of the ZF flag.
  2875. ///
  2876. /// \headerfile <x86intrin.h>
  2877. ///
  2878. /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
  2879. ///
  2880. /// \param __a
  2881. ///    A 256-bit integer vector.
  2882. /// \param __b
  2883. ///    A 256-bit integer vector.
  2884. /// \returns the ZF flag.
  2885. static __inline int __DEFAULT_FN_ATTRS
  2886. _mm256_testz_si256(__m256i __a, __m256i __b)
  2887. {
  2888.   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
  2889. }
  2890.  
  2891. /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
  2892. ///    of the two source vectors.
  2893. ///
  2894. ///    The EFLAGS register is updated as follows: \n
  2895. ///    If there is at least one pair of bits where both bits are 1, the ZF flag
  2896. ///    is set to 0. Otherwise the ZF flag is set to 1. \n
  2897. ///    If there is at least one pair of bits where the bit from the first source
  2898. ///    vector is 0 and the bit from the second source vector is 1, the CF flag
  2899. ///    is set to 0. Otherwise the CF flag is set to 1. \n
  2900. ///    This intrinsic returns the value of the CF flag.
  2901. ///
  2902. /// \headerfile <x86intrin.h>
  2903. ///
  2904. /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
  2905. ///
  2906. /// \param __a
  2907. ///    A 256-bit integer vector.
  2908. /// \param __b
  2909. ///    A 256-bit integer vector.
  2910. /// \returns the CF flag.
  2911. static __inline int __DEFAULT_FN_ATTRS
  2912. _mm256_testc_si256(__m256i __a, __m256i __b)
  2913. {
  2914.   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
  2915. }
  2916.  
  2917. /// Given two 256-bit integer vectors, perform a bit-by-bit comparison
  2918. ///    of the two source vectors.
  2919. ///
  2920. ///    The EFLAGS register is updated as follows: \n
  2921. ///    If there is at least one pair of bits where both bits are 1, the ZF flag
  2922. ///    is set to 0. Otherwise the ZF flag is set to 1. \n
  2923. ///    If there is at least one pair of bits where the bit from the first source
  2924. ///    vector is 0 and the bit from the second source vector is 1, the CF flag
  2925. ///    is set to 0. Otherwise the CF flag is set to 1. \n
  2926. ///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
  2927. ///    otherwise it returns 0.
  2928. ///
  2929. /// \headerfile <x86intrin.h>
  2930. ///
  2931. /// This intrinsic corresponds to the <c> VPTEST </c> instruction.
  2932. ///
  2933. /// \param __a
  2934. ///    A 256-bit integer vector.
  2935. /// \param __b
  2936. ///    A 256-bit integer vector.
  2937. /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
  2938. static __inline int __DEFAULT_FN_ATTRS
  2939. _mm256_testnzc_si256(__m256i __a, __m256i __b)
  2940. {
  2941.   return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
  2942. }
  2943.  
  2944. /* Vector extract sign mask */
  2945. /// Extracts the sign bits of double-precision floating point elements
  2946. ///    in a 256-bit vector of [4 x double] and writes them to the lower order
  2947. ///    bits of the return value.
  2948. ///
  2949. /// \headerfile <x86intrin.h>
  2950. ///
  2951. /// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
  2952. ///
  2953. /// \param __a
  2954. ///    A 256-bit vector of [4 x double] containing the double-precision
  2955. ///    floating point values with sign bits to be extracted.
  2956. /// \returns The sign bits from the operand, written to bits [3:0].
  2957. static __inline int __DEFAULT_FN_ATTRS
  2958. _mm256_movemask_pd(__m256d __a)
  2959. {
  2960.   return __builtin_ia32_movmskpd256((__v4df)__a);
  2961. }
  2962.  
  2963. /// Extracts the sign bits of single-precision floating point elements
  2964. ///    in a 256-bit vector of [8 x float] and writes them to the lower order
  2965. ///    bits of the return value.
  2966. ///
  2967. /// \headerfile <x86intrin.h>
  2968. ///
  2969. /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
  2970. ///
  2971. /// \param __a
  2972. ///    A 256-bit vector of [8 x float] containing the single-precision floating
  2973. ///    point values with sign bits to be extracted.
  2974. /// \returns The sign bits from the operand, written to bits [7:0].
  2975. static __inline int __DEFAULT_FN_ATTRS
  2976. _mm256_movemask_ps(__m256 __a)
  2977. {
  2978.   return __builtin_ia32_movmskps256((__v8sf)__a);
  2979. }
  2980.  
  2981. /* Vector __zero */
  2982. /// Zeroes the contents of all XMM or YMM registers.
  2983. ///
  2984. /// \headerfile <x86intrin.h>
  2985. ///
  2986. /// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
  2987. static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
  2988. _mm256_zeroall(void)
  2989. {
  2990.   __builtin_ia32_vzeroall();
  2991. }
  2992.  
  2993. /// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
  2994. ///
  2995. /// \headerfile <x86intrin.h>
  2996. ///
  2997. /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
  2998. static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
  2999. _mm256_zeroupper(void)
  3000. {
  3001.   __builtin_ia32_vzeroupper();
  3002. }
  3003.  
  3004. /* Vector load with broadcast */
  3005. /// Loads a scalar single-precision floating point value from the
  3006. ///    specified address pointed to by \a __a and broadcasts it to the elements
  3007. ///    of a [4 x float] vector.
  3008. ///
  3009. /// \headerfile <x86intrin.h>
  3010. ///
  3011. /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
  3012. ///
  3013. /// \param __a
  3014. ///    The single-precision floating point value to be broadcast.
  3015. /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
  3016. ///    equal to the broadcast value.
  3017. static __inline __m128 __DEFAULT_FN_ATTRS128
  3018. _mm_broadcast_ss(float const *__a)
  3019. {
  3020.   float __f = *__a;
  3021.   return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
  3022. }
  3023.  
  3024. /// Loads a scalar double-precision floating point value from the
  3025. ///    specified address pointed to by \a __a and broadcasts it to the elements
  3026. ///    of a [4 x double] vector.
  3027. ///
  3028. /// \headerfile <x86intrin.h>
  3029. ///
  3030. /// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
  3031. ///
  3032. /// \param __a
  3033. ///    The double-precision floating point value to be broadcast.
  3034. /// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
  3035. ///    equal to the broadcast value.
  3036. static __inline __m256d __DEFAULT_FN_ATTRS
  3037. _mm256_broadcast_sd(double const *__a)
  3038. {
  3039.   double __d = *__a;
  3040.   return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
  3041. }
  3042.  
  3043. /// Loads a scalar single-precision floating point value from the
  3044. ///    specified address pointed to by \a __a and broadcasts it to the elements
  3045. ///    of a [8 x float] vector.
  3046. ///
  3047. /// \headerfile <x86intrin.h>
  3048. ///
  3049. /// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
  3050. ///
  3051. /// \param __a
  3052. ///    The single-precision floating point value to be broadcast.
  3053. /// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
  3054. ///    equal to the broadcast value.
  3055. static __inline __m256 __DEFAULT_FN_ATTRS
  3056. _mm256_broadcast_ss(float const *__a)
  3057. {
  3058.   float __f = *__a;
  3059.   return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
  3060. }
  3061.  
  3062. /// Loads the data from a 128-bit vector of [2 x double] from the
  3063. ///    specified address pointed to by \a __a and broadcasts it to 128-bit
  3064. ///    elements in a 256-bit vector of [4 x double].
  3065. ///
  3066. /// \headerfile <x86intrin.h>
  3067. ///
  3068. /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
  3069. ///
  3070. /// \param __a
  3071. ///    The 128-bit vector of [2 x double] to be broadcast.
  3072. /// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
  3073. ///    equal to the broadcast value.
  3074. static __inline __m256d __DEFAULT_FN_ATTRS
  3075. _mm256_broadcast_pd(__m128d const *__a)
  3076. {
  3077.   __m128d __b = _mm_loadu_pd((const double *)__a);
  3078.   return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
  3079.                                           0, 1, 0, 1);
  3080. }
  3081.  
  3082. /// Loads the data from a 128-bit vector of [4 x float] from the
  3083. ///    specified address pointed to by \a __a and broadcasts it to 128-bit
  3084. ///    elements in a 256-bit vector of [8 x float].
  3085. ///
  3086. /// \headerfile <x86intrin.h>
  3087. ///
  3088. /// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
  3089. ///
  3090. /// \param __a
  3091. ///    The 128-bit vector of [4 x float] to be broadcast.
  3092. /// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
  3093. ///    equal to the broadcast value.
  3094. static __inline __m256 __DEFAULT_FN_ATTRS
  3095. _mm256_broadcast_ps(__m128 const *__a)
  3096. {
  3097.   __m128 __b = _mm_loadu_ps((const float *)__a);
  3098.   return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
  3099.                                          0, 1, 2, 3, 0, 1, 2, 3);
  3100. }
  3101.  
  3102. /* SIMD load ops */
  3103. /// Loads 4 double-precision floating point values from a 32-byte aligned
  3104. ///    memory location pointed to by \a __p into a vector of [4 x double].
  3105. ///
  3106. /// \headerfile <x86intrin.h>
  3107. ///
  3108. /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
  3109. ///
  3110. /// \param __p
  3111. ///    A 32-byte aligned pointer to a memory location containing
  3112. ///    double-precision floating point values.
  3113. /// \returns A 256-bit vector of [4 x double] containing the moved values.
  3114. static __inline __m256d __DEFAULT_FN_ATTRS
  3115. _mm256_load_pd(double const *__p)
  3116. {
  3117.   return *(const __m256d *)__p;
  3118. }
  3119.  
  3120. /// Loads 8 single-precision floating point values from a 32-byte aligned
  3121. ///    memory location pointed to by \a __p into a vector of [8 x float].
  3122. ///
  3123. /// \headerfile <x86intrin.h>
  3124. ///
  3125. /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
  3126. ///
  3127. /// \param __p
  3128. ///    A 32-byte aligned pointer to a memory location containing float values.
  3129. /// \returns A 256-bit vector of [8 x float] containing the moved values.
  3130. static __inline __m256 __DEFAULT_FN_ATTRS
  3131. _mm256_load_ps(float const *__p)
  3132. {
  3133.   return *(const __m256 *)__p;
  3134. }
  3135.  
  3136. /// Loads 4 double-precision floating point values from an unaligned
  3137. ///    memory location pointed to by \a __p into a vector of [4 x double].
  3138. ///
  3139. /// \headerfile <x86intrin.h>
  3140. ///
  3141. /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
  3142. ///
  3143. /// \param __p
  3144. ///    A pointer to a memory location containing double-precision floating
  3145. ///    point values.
  3146. /// \returns A 256-bit vector of [4 x double] containing the moved values.
  3147. static __inline __m256d __DEFAULT_FN_ATTRS
  3148. _mm256_loadu_pd(double const *__p)
  3149. {
  3150.   struct __loadu_pd {
  3151.     __m256d_u __v;
  3152.   } __attribute__((__packed__, __may_alias__));
  3153.   return ((const struct __loadu_pd*)__p)->__v;
  3154. }
  3155.  
  3156. /// Loads 8 single-precision floating point values from an unaligned
  3157. ///    memory location pointed to by \a __p into a vector of [8 x float].
  3158. ///
  3159. /// \headerfile <x86intrin.h>
  3160. ///
  3161. /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
  3162. ///
  3163. /// \param __p
  3164. ///    A pointer to a memory location containing single-precision floating
  3165. ///    point values.
  3166. /// \returns A 256-bit vector of [8 x float] containing the moved values.
  3167. static __inline __m256 __DEFAULT_FN_ATTRS
  3168. _mm256_loadu_ps(float const *__p)
  3169. {
  3170.   struct __loadu_ps {
  3171.     __m256_u __v;
  3172.   } __attribute__((__packed__, __may_alias__));
  3173.   return ((const struct __loadu_ps*)__p)->__v;
  3174. }
  3175.  
  3176. /// Loads 256 bits of integer data from a 32-byte aligned memory
  3177. ///    location pointed to by \a __p into elements of a 256-bit integer vector.
  3178. ///
  3179. /// \headerfile <x86intrin.h>
  3180. ///
  3181. /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
  3182. ///
  3183. /// \param __p
  3184. ///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
  3185. ///    values.
  3186. /// \returns A 256-bit integer vector containing the moved values.
  3187. static __inline __m256i __DEFAULT_FN_ATTRS
  3188. _mm256_load_si256(__m256i const *__p)
  3189. {
  3190.   return *__p;
  3191. }
  3192.  
  3193. /// Loads 256 bits of integer data from an unaligned memory location
  3194. ///    pointed to by \a __p into a 256-bit integer vector.
  3195. ///
  3196. /// \headerfile <x86intrin.h>
  3197. ///
  3198. /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
  3199. ///
  3200. /// \param __p
  3201. ///    A pointer to a 256-bit integer vector containing integer values.
  3202. /// \returns A 256-bit integer vector containing the moved values.
  3203. static __inline __m256i __DEFAULT_FN_ATTRS
  3204. _mm256_loadu_si256(__m256i_u const *__p)
  3205. {
  3206.   struct __loadu_si256 {
  3207.     __m256i_u __v;
  3208.   } __attribute__((__packed__, __may_alias__));
  3209.   return ((const struct __loadu_si256*)__p)->__v;
  3210. }
  3211.  
  3212. /// Loads 256 bits of integer data from an unaligned memory location
  3213. ///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
  3214. ///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
  3215. ///    line boundary.
  3216. ///
  3217. /// \headerfile <x86intrin.h>
  3218. ///
  3219. /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
  3220. ///
  3221. /// \param __p
  3222. ///    A pointer to a 256-bit integer vector containing integer values.
  3223. /// \returns A 256-bit integer vector containing the moved values.
  3224. static __inline __m256i __DEFAULT_FN_ATTRS
  3225. _mm256_lddqu_si256(__m256i_u const *__p)
  3226. {
  3227.   return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
  3228. }
  3229.  
  3230. /* SIMD store ops */
  3231. /// Stores double-precision floating point values from a 256-bit vector
  3232. ///    of [4 x double] to a 32-byte aligned memory location pointed to by
  3233. ///    \a __p.
  3234. ///
  3235. /// \headerfile <x86intrin.h>
  3236. ///
  3237. /// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
  3238. ///
  3239. /// \param __p
  3240. ///    A 32-byte aligned pointer to a memory location that will receive the
  3241. ///    double-precision floaing point values.
  3242. /// \param __a
  3243. ///    A 256-bit vector of [4 x double] containing the values to be moved.
  3244. static __inline void __DEFAULT_FN_ATTRS
  3245. _mm256_store_pd(double *__p, __m256d __a)
  3246. {
  3247.   *(__m256d *)__p = __a;
  3248. }
  3249.  
  3250. /// Stores single-precision floating point values from a 256-bit vector
  3251. ///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
  3252. ///
  3253. /// \headerfile <x86intrin.h>
  3254. ///
  3255. /// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
  3256. ///
  3257. /// \param __p
  3258. ///    A 32-byte aligned pointer to a memory location that will receive the
  3259. ///    float values.
  3260. /// \param __a
  3261. ///    A 256-bit vector of [8 x float] containing the values to be moved.
  3262. static __inline void __DEFAULT_FN_ATTRS
  3263. _mm256_store_ps(float *__p, __m256 __a)
  3264. {
  3265.   *(__m256 *)__p = __a;
  3266. }
  3267.  
  3268. /// Stores double-precision floating point values from a 256-bit vector
  3269. ///    of [4 x double] to an unaligned memory location pointed to by \a __p.
  3270. ///
  3271. /// \headerfile <x86intrin.h>
  3272. ///
  3273. /// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
  3274. ///
  3275. /// \param __p
  3276. ///    A pointer to a memory location that will receive the double-precision
  3277. ///    floating point values.
  3278. /// \param __a
  3279. ///    A 256-bit vector of [4 x double] containing the values to be moved.
  3280. static __inline void __DEFAULT_FN_ATTRS
  3281. _mm256_storeu_pd(double *__p, __m256d __a)
  3282. {
  3283.   struct __storeu_pd {
  3284.     __m256d_u __v;
  3285.   } __attribute__((__packed__, __may_alias__));
  3286.   ((struct __storeu_pd*)__p)->__v = __a;
  3287. }
  3288.  
  3289. /// Stores single-precision floating point values from a 256-bit vector
  3290. ///    of [8 x float] to an unaligned memory location pointed to by \a __p.
  3291. ///
  3292. /// \headerfile <x86intrin.h>
  3293. ///
  3294. /// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
  3295. ///
  3296. /// \param __p
  3297. ///    A pointer to a memory location that will receive the float values.
  3298. /// \param __a
  3299. ///    A 256-bit vector of [8 x float] containing the values to be moved.
  3300. static __inline void __DEFAULT_FN_ATTRS
  3301. _mm256_storeu_ps(float *__p, __m256 __a)
  3302. {
  3303.   struct __storeu_ps {
  3304.     __m256_u __v;
  3305.   } __attribute__((__packed__, __may_alias__));
  3306.   ((struct __storeu_ps*)__p)->__v = __a;
  3307. }
  3308.  
  3309. /// Stores integer values from a 256-bit integer vector to a 32-byte
  3310. ///    aligned memory location pointed to by \a __p.
  3311. ///
  3312. /// \headerfile <x86intrin.h>
  3313. ///
  3314. /// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
  3315. ///
  3316. /// \param __p
  3317. ///    A 32-byte aligned pointer to a memory location that will receive the
  3318. ///    integer values.
  3319. /// \param __a
  3320. ///    A 256-bit integer vector containing the values to be moved.
  3321. static __inline void __DEFAULT_FN_ATTRS
  3322. _mm256_store_si256(__m256i *__p, __m256i __a)
  3323. {
  3324.   *__p = __a;
  3325. }
  3326.  
  3327. /// Stores integer values from a 256-bit integer vector to an unaligned
  3328. ///    memory location pointed to by \a __p.
  3329. ///
  3330. /// \headerfile <x86intrin.h>
  3331. ///
  3332. /// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
  3333. ///
  3334. /// \param __p
  3335. ///    A pointer to a memory location that will receive the integer values.
  3336. /// \param __a
  3337. ///    A 256-bit integer vector containing the values to be moved.
  3338. static __inline void __DEFAULT_FN_ATTRS
  3339. _mm256_storeu_si256(__m256i_u *__p, __m256i __a)
  3340. {
  3341.   struct __storeu_si256 {
  3342.     __m256i_u __v;
  3343.   } __attribute__((__packed__, __may_alias__));
  3344.   ((struct __storeu_si256*)__p)->__v = __a;
  3345. }
  3346.  
  3347. /* Conditional load ops */
  3348. /// Conditionally loads double-precision floating point elements from a
  3349. ///    memory location pointed to by \a __p into a 128-bit vector of
  3350. ///    [2 x double], depending on the mask bits associated with each data
  3351. ///    element.
  3352. ///
  3353. /// \headerfile <x86intrin.h>
  3354. ///
  3355. /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
  3356. ///
  3357. /// \param __p
  3358. ///    A pointer to a memory location that contains the double-precision
  3359. ///    floating point values.
  3360. /// \param __m
  3361. ///    A 128-bit integer vector containing the mask. The most significant bit of
  3362. ///    each data element represents the mask bits. If a mask bit is zero, the
  3363. ///    corresponding value in the memory location is not loaded and the
  3364. ///    corresponding field in the return value is set to zero.
  3365. /// \returns A 128-bit vector of [2 x double] containing the loaded values.
  3366. static __inline __m128d __DEFAULT_FN_ATTRS128
  3367. _mm_maskload_pd(double const *__p, __m128i __m)
  3368. {
  3369.   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
  3370. }
  3371.  
  3372. /// Conditionally loads double-precision floating point elements from a
  3373. ///    memory location pointed to by \a __p into a 256-bit vector of
  3374. ///    [4 x double], depending on the mask bits associated with each data
  3375. ///    element.
  3376. ///
  3377. /// \headerfile <x86intrin.h>
  3378. ///
  3379. /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
  3380. ///
  3381. /// \param __p
  3382. ///    A pointer to a memory location that contains the double-precision
  3383. ///    floating point values.
  3384. /// \param __m
  3385. ///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
  3386. ///    significant bit of each quadword element represents the mask bits. If a
  3387. ///    mask bit is zero, the corresponding value in the memory location is not
  3388. ///    loaded and the corresponding field in the return value is set to zero.
  3389. /// \returns A 256-bit vector of [4 x double] containing the loaded values.
  3390. static __inline __m256d __DEFAULT_FN_ATTRS
  3391. _mm256_maskload_pd(double const *__p, __m256i __m)
  3392. {
  3393.   return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
  3394.                                                (__v4di)__m);
  3395. }
  3396.  
  3397. /// Conditionally loads single-precision floating-point elements from a
  3398. ///    memory location pointed to by \a __p into a 128-bit vector of
  3399. ///    [4 x float], depending on the mask bits associated with each data
  3400. ///    element.
  3401. ///
  3402. /// \headerfile <x86intrin.h>
  3403. ///
  3404. /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
  3405. ///
  3406. /// \param __p
  3407. ///    A pointer to a memory location that contains the single-precision
  3408. ///    floating-point values.
  3409. /// \param __m
  3410. ///    A 128-bit integer vector of [4 x dword] containing the mask. The most
  3411. ///    significant bit of each dword element represents the mask bits. If a mask
  3412. ///    bit is zero, the corresponding value in the memory location is not loaded
  3413. ///    and the corresponding field in the return value is set to zero.
  3414. /// \returns A 128-bit vector of [4 x float] containing the loaded values.
  3415. static __inline __m128 __DEFAULT_FN_ATTRS128
  3416. _mm_maskload_ps(float const *__p, __m128i __m)
  3417. {
  3418.   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
  3419. }
  3420.  
  3421. /// Conditionally loads single-precision floating-point elements from a
  3422. ///    memory location pointed to by \a __p into a 256-bit vector of
  3423. ///    [8 x float], depending on the mask bits associated with each data
  3424. ///    element.
  3425. ///
  3426. /// \headerfile <x86intrin.h>
  3427. ///
  3428. /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
  3429. ///
  3430. /// \param __p
  3431. ///    A pointer to a memory location that contains the single-precision
  3432. ///    floating-point values.
  3433. /// \param __m
  3434. ///    A 256-bit integer vector of [8 x dword] containing the mask. The most
  3435. ///    significant bit of each dword element represents the mask bits. If a mask
  3436. ///    bit is zero, the corresponding value in the memory location is not loaded
  3437. ///    and the corresponding field in the return value is set to zero.
  3438. /// \returns A 256-bit vector of [8 x float] containing the loaded values.
  3439. static __inline __m256 __DEFAULT_FN_ATTRS
  3440. _mm256_maskload_ps(float const *__p, __m256i __m)
  3441. {
  3442.   return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
  3443. }
  3444.  
  3445. /* Conditional store ops */
  3446. /// Moves single-precision floating-point values from a 256-bit vector
  3447. ///    of [8 x float] to a memory location pointed to by \a __p, according to
  3448. ///    the specified mask.
  3449. ///
  3450. /// \headerfile <x86intrin.h>
  3451. ///
  3452. /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
  3453. ///
  3454. /// \param __p
  3455. ///    A pointer to a memory location that will receive the float values.
  3456. /// \param __m
  3457. ///    A 256-bit integer vector of [8 x dword] containing the mask. The most
  3458. ///    significant bit of each dword element in the mask vector represents the
  3459. ///    mask bits. If a mask bit is zero, the corresponding value from vector
  3460. ///    \a __a is not stored and the corresponding field in the memory location
  3461. ///    pointed to by \a __p is not changed.
  3462. /// \param __a
  3463. ///    A 256-bit vector of [8 x float] containing the values to be stored.
  3464. static __inline void __DEFAULT_FN_ATTRS
  3465. _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
  3466. {
  3467.   __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
  3468. }
  3469.  
  3470. /// Moves double-precision values from a 128-bit vector of [2 x double]
  3471. ///    to a memory location pointed to by \a __p, according to the specified
  3472. ///    mask.
  3473. ///
  3474. /// \headerfile <x86intrin.h>
  3475. ///
  3476. /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
  3477. ///
  3478. /// \param __p
  3479. ///    A pointer to a memory location that will receive the double values.
  3480. /// \param __m
  3481. ///    A 128-bit integer vector of [2 x quadword] containing the mask. The most
  3482. ///    significant bit of each quadword element in the mask vector represents
  3483. ///    the mask bits. If a mask bit is zero, the corresponding value from vector
  3484. ///    \a __a is not stored and the corresponding field in the memory location
  3485. ///    pointed to by \a __p is not changed.
  3486. /// \param __a
  3487. ///    A 128-bit vector of [2 x double] containing the values to be stored.
  3488. static __inline void __DEFAULT_FN_ATTRS128
  3489. _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
  3490. {
  3491.   __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
  3492. }
  3493.  
  3494. /// Moves double-precision values from a 256-bit vector of [4 x double]
  3495. ///    to a memory location pointed to by \a __p, according to the specified
  3496. ///    mask.
  3497. ///
  3498. /// \headerfile <x86intrin.h>
  3499. ///
  3500. /// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
  3501. ///
  3502. /// \param __p
  3503. ///    A pointer to a memory location that will receive the double values.
  3504. /// \param __m
  3505. ///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
  3506. ///    significant bit of each quadword element in the mask vector represents
  3507. ///    the mask bits. If a mask bit is zero, the corresponding value from vector
  3508. ///    \a __a is not stored and the corresponding field in the memory location
  3509. ///    pointed to by \a __p is not changed.
  3510. /// \param __a
  3511. ///    A 256-bit vector of [4 x double] containing the values to be stored.
  3512. static __inline void __DEFAULT_FN_ATTRS
  3513. _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
  3514. {
  3515.   __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
  3516. }
  3517.  
  3518. /// Moves single-precision floating-point values from a 128-bit vector
  3519. ///    of [4 x float] to a memory location pointed to by \a __p, according to
  3520. ///    the specified mask.
  3521. ///
  3522. /// \headerfile <x86intrin.h>
  3523. ///
  3524. /// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
  3525. ///
  3526. /// \param __p
  3527. ///    A pointer to a memory location that will receive the float values.
  3528. /// \param __m
  3529. ///    A 128-bit integer vector of [4 x dword] containing the mask. The most
  3530. ///    significant bit of each dword element in the mask vector represents the
  3531. ///    mask bits. If a mask bit is zero, the corresponding value from vector
  3532. ///    \a __a is not stored and the corresponding field in the memory location
  3533. ///    pointed to by \a __p is not changed.
  3534. /// \param __a
  3535. ///    A 128-bit vector of [4 x float] containing the values to be stored.
  3536. static __inline void __DEFAULT_FN_ATTRS128
  3537. _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
  3538. {
  3539.   __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
  3540. }
  3541.  
  3542. /* Cacheability support ops */
  3543. /// Moves integer data from a 256-bit integer vector to a 32-byte
  3544. ///    aligned memory location. To minimize caching, the data is flagged as
  3545. ///    non-temporal (unlikely to be used again soon).
  3546. ///
  3547. /// \headerfile <x86intrin.h>
  3548. ///
  3549. /// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
  3550. ///
  3551. /// \param __a
  3552. ///    A pointer to a 32-byte aligned memory location that will receive the
  3553. ///    integer values.
  3554. /// \param __b
  3555. ///    A 256-bit integer vector containing the values to be moved.
  3556. static __inline void __DEFAULT_FN_ATTRS
  3557. _mm256_stream_si256(__m256i *__a, __m256i __b)
  3558. {
  3559.   typedef __v4di __v4di_aligned __attribute__((aligned(32))); /* [4 x i64] vector type with explicit 32-byte alignment */
  3560.   __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
  3561. }
  3562.  
  3563. /// Moves double-precision values from a 256-bit vector of [4 x double]
  3564. ///    to a 32-byte aligned memory location. To minimize caching, the data is
  3565. ///    flagged as non-temporal (unlikely to be used again soon).
  3566. ///
  3567. /// \headerfile <x86intrin.h>
  3568. ///
  3569. /// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
  3570. ///
  3571. /// \param __a
  3572. ///    A pointer to a 32-byte aligned memory location that will receive the
  3573. ///    double-precision floating-point values.
  3574. /// \param __b
  3575. ///    A 256-bit vector of [4 x double] containing the values to be moved.
  3576. static __inline void __DEFAULT_FN_ATTRS
  3577. _mm256_stream_pd(double *__a, __m256d __b)
  3578. {
  3579.   typedef __v4df __v4df_aligned __attribute__((aligned(32))); /* [4 x double] vector type with explicit 32-byte alignment */
  3580.   __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
  3581. }
  3582.  
  3583. /// Moves single-precision floating-point values from a 256-bit vector
  3584. ///    of [8 x float] to a 32-byte aligned memory location. To minimize
  3585. ///    caching, the data is flagged as non-temporal (unlikely to be used again
  3586. ///    soon).
  3587. ///
  3588. /// \headerfile <x86intrin.h>
  3589. ///
  3590. /// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
  3591. ///
  3592. /// \param __p
  3593. ///    A pointer to a 32-byte aligned memory location that will receive the
  3594. ///    single-precision floating-point values.
  3595. /// \param __a
  3596. ///    A 256-bit vector of [8 x float] containing the values to be moved.
  3597. static __inline void __DEFAULT_FN_ATTRS
  3598. _mm256_stream_ps(float *__p, __m256 __a)
  3599. {
  3600.   typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); /* [8 x float] vector type with explicit 32-byte alignment */
  3601.   __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
  3602. }
  3603.  
  3604. /* Create vectors */
  3605. /// Creates a 256-bit vector of [4 x double] with undefined values.
  3606. ///
  3607. /// \headerfile <x86intrin.h>
  3608. ///
  3609. /// This intrinsic has no corresponding instruction.
  3610. ///
  3611. /// \returns A 256-bit vector of [4 x double] containing undefined values.
  3612. static __inline__ __m256d __DEFAULT_FN_ATTRS
  3613. _mm256_undefined_pd(void)
  3614. {
  3615.   return (__m256d)__builtin_ia32_undef256();
  3616. }
  3617.  
  3618. /// Creates a 256-bit vector of [8 x float] with undefined values.
  3619. ///
  3620. /// \headerfile <x86intrin.h>
  3621. ///
  3622. /// This intrinsic has no corresponding instruction.
  3623. ///
  3624. /// \returns A 256-bit vector of [8 x float] containing undefined values.
  3625. static __inline__ __m256 __DEFAULT_FN_ATTRS
  3626. _mm256_undefined_ps(void)
  3627. {
  3628.   return (__m256)__builtin_ia32_undef256();
  3629. }
  3630.  
  3631. /// Creates a 256-bit integer vector with undefined values.
  3632. ///
  3633. /// \headerfile <x86intrin.h>
  3634. ///
  3635. /// This intrinsic has no corresponding instruction.
  3636. ///
  3637. /// \returns A 256-bit integer vector containing undefined values.
  3638. static __inline__ __m256i __DEFAULT_FN_ATTRS
  3639. _mm256_undefined_si256(void)
  3640. {
  3641.   return (__m256i)__builtin_ia32_undef256();
  3642. }
  3643.  
  3644. /// Constructs a 256-bit floating-point vector of [4 x double] initialized
  3645. ///    with the specified double-precision values, highest element first.
  3646. ///
  3647. /// \headerfile <x86intrin.h>
  3648. ///
  3649. /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
  3650. ///   instruction.
  3651. ///
  3652. /// \param __a
  3653. ///    A double-precision floating-point value used to initialize bits [255:192]
  3654. ///    of the result.
  3655. /// \param __b
  3656. ///    A double-precision floating-point value used to initialize bits [191:128]
  3657. ///    of the result.
  3658. /// \param __c
  3659. ///    A double-precision floating-point value used to initialize bits [127:64]
  3660. ///    of the result.
  3661. /// \param __d
  3662. ///    A double-precision floating-point value used to initialize bits [63:0]
  3663. ///    of the result.
  3664. /// \returns An initialized 256-bit floating-point vector of [4 x double].
  3665. static __inline __m256d __DEFAULT_FN_ATTRS
  3666. _mm256_set_pd(double __a, double __b, double __c, double __d)
  3667. {
  3668.   return __extension__ (__m256d){ __d, __c, __b, __a };
  3669. }
  3670.  
  3671. /// Constructs a 256-bit floating-point vector of [8 x float] initialized
  3672. ///    with the specified single-precision values, highest element first.
  3673. ///
  3674. /// \headerfile <x86intrin.h>
  3675. ///
  3676. /// This intrinsic is a utility function and does not correspond to a specific
  3677. ///   instruction.
  3678. ///
  3679. /// \param __a
  3680. ///    A single-precision floating-point value used to initialize bits [255:224]
  3681. ///    of the result.
  3682. /// \param __b
  3683. ///    A single-precision floating-point value used to initialize bits [223:192]
  3684. ///    of the result.
  3685. /// \param __c
  3686. ///    A single-precision floating-point value used to initialize bits [191:160]
  3687. ///    of the result.
  3688. /// \param __d
  3689. ///    A single-precision floating-point value used to initialize bits [159:128]
  3690. ///    of the result.
  3691. /// \param __e
  3692. ///    A single-precision floating-point value used to initialize bits [127:96]
  3693. ///    of the result.
  3694. /// \param __f
  3695. ///    A single-precision floating-point value used to initialize bits [95:64]
  3696. ///    of the result.
  3697. /// \param __g
  3698. ///    A single-precision floating-point value used to initialize bits [63:32]
  3699. ///    of the result.
  3700. /// \param __h
  3701. ///    A single-precision floating-point value used to initialize bits [31:0]
  3702. ///    of the result.
  3703. /// \returns An initialized 256-bit floating-point vector of [8 x float].
  3704. static __inline __m256 __DEFAULT_FN_ATTRS
  3705. _mm256_set_ps(float __a, float __b, float __c, float __d,
  3706.               float __e, float __f, float __g, float __h)
  3707. {
  3708.   return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
  3709. }
  3710.  
  3711. /// Constructs a 256-bit integer vector initialized with the specified
  3712. ///    32-bit integral values, highest element first.
  3713. ///
  3714. /// \headerfile <x86intrin.h>
  3715. ///
  3716. /// This intrinsic is a utility function and does not correspond to a specific
  3717. ///   instruction.
  3718. ///
  3719. /// \param __i0
  3720. ///    A 32-bit integral value used to initialize bits [255:224] of the result.
  3721. /// \param __i1
  3722. ///    A 32-bit integral value used to initialize bits [223:192] of the result.
  3723. /// \param __i2
  3724. ///    A 32-bit integral value used to initialize bits [191:160] of the result.
  3725. /// \param __i3
  3726. ///    A 32-bit integral value used to initialize bits [159:128] of the result.
  3727. /// \param __i4
  3728. ///    A 32-bit integral value used to initialize bits [127:96] of the result.
  3729. /// \param __i5
  3730. ///    A 32-bit integral value used to initialize bits [95:64] of the result.
  3731. /// \param __i6
  3732. ///    A 32-bit integral value used to initialize bits [63:32] of the result.
  3733. /// \param __i7
  3734. ///    A 32-bit integral value used to initialize bits [31:0] of the result.
  3735. /// \returns An initialized 256-bit integer vector.
  3736. static __inline __m256i __DEFAULT_FN_ATTRS
  3737. _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
  3738.                  int __i4, int __i5, int __i6, int __i7)
  3739. {
  3740.   return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
  3741. }
  3742.  
  3743. /// Constructs a 256-bit integer vector initialized with the specified
  3744. ///    16-bit integral values, highest element first.
  3745. ///
  3746. /// \headerfile <x86intrin.h>
  3747. ///
  3748. /// This intrinsic is a utility function and does not correspond to a specific
  3749. ///   instruction.
  3750. ///
  3751. /// \param __w15
  3752. ///    A 16-bit integral value used to initialize bits [255:240] of the result.
  3753. /// \param __w14
  3754. ///    A 16-bit integral value used to initialize bits [239:224] of the result.
  3755. /// \param __w13
  3756. ///    A 16-bit integral value used to initialize bits [223:208] of the result.
  3757. /// \param __w12
  3758. ///    A 16-bit integral value used to initialize bits [207:192] of the result.
  3759. /// \param __w11
  3760. ///    A 16-bit integral value used to initialize bits [191:176] of the result.
  3761. /// \param __w10
  3762. ///    A 16-bit integral value used to initialize bits [175:160] of the result.
  3763. /// \param __w09
  3764. ///    A 16-bit integral value used to initialize bits [159:144] of the result.
  3765. /// \param __w08
  3766. ///    A 16-bit integral value used to initialize bits [143:128] of the result.
  3767. /// \param __w07
  3768. ///    A 16-bit integral value used to initialize bits [127:112] of the result.
  3769. /// \param __w06
  3770. ///    A 16-bit integral value used to initialize bits [111:96] of the result.
  3771. /// \param __w05
  3772. ///    A 16-bit integral value used to initialize bits [95:80] of the result.
  3773. /// \param __w04
  3774. ///    A 16-bit integral value used to initialize bits [79:64] of the result.
  3775. /// \param __w03
  3776. ///    A 16-bit integral value used to initialize bits [63:48] of the result.
  3777. /// \param __w02
  3778. ///    A 16-bit integral value used to initialize bits [47:32] of the result.
  3779. /// \param __w01
  3780. ///    A 16-bit integral value used to initialize bits [31:16] of the result.
  3781. /// \param __w00
  3782. ///    A 16-bit integral value used to initialize bits [15:0] of the result.
  3783. /// \returns An initialized 256-bit integer vector.
  3784. static __inline __m256i __DEFAULT_FN_ATTRS
  3785. _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
  3786.                  short __w11, short __w10, short __w09, short __w08,
  3787.                  short __w07, short __w06, short __w05, short __w04,
  3788.                  short __w03, short __w02, short __w01, short __w00)
  3789. {
  3790.   return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
  3791.     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
  3792. }
  3793.  
  3794. /// Constructs a 256-bit integer vector initialized with the specified
  3795. ///    8-bit integral values, highest element first.
  3796. ///
  3797. /// \headerfile <x86intrin.h>
  3798. ///
  3799. /// This intrinsic is a utility function and does not correspond to a specific
  3800. ///   instruction.
  3801. ///
  3802. /// \param __b31
  3803. ///    An 8-bit integral value used to initialize bits [255:248] of the result.
  3804. /// \param __b30
  3805. ///    An 8-bit integral value used to initialize bits [247:240] of the result.
  3806. /// \param __b29
  3807. ///    An 8-bit integral value used to initialize bits [239:232] of the result.
  3808. /// \param __b28
  3809. ///    An 8-bit integral value used to initialize bits [231:224] of the result.
  3810. /// \param __b27
  3811. ///    An 8-bit integral value used to initialize bits [223:216] of the result.
  3812. /// \param __b26
  3813. ///    An 8-bit integral value used to initialize bits [215:208] of the result.
  3814. /// \param __b25
  3815. ///    An 8-bit integral value used to initialize bits [207:200] of the result.
  3816. /// \param __b24
  3817. ///    An 8-bit integral value used to initialize bits [199:192] of the result.
  3818. /// \param __b23
  3819. ///    An 8-bit integral value used to initialize bits [191:184] of the result.
  3820. /// \param __b22
  3821. ///    An 8-bit integral value used to initialize bits [183:176] of the result.
  3822. /// \param __b21
  3823. ///    An 8-bit integral value used to initialize bits [175:168] of the result.
  3824. /// \param __b20
  3825. ///    An 8-bit integral value used to initialize bits [167:160] of the result.
  3826. /// \param __b19
  3827. ///    An 8-bit integral value used to initialize bits [159:152] of the result.
  3828. /// \param __b18
  3829. ///    An 8-bit integral value used to initialize bits [151:144] of the result.
  3830. /// \param __b17
  3831. ///    An 8-bit integral value used to initialize bits [143:136] of the result.
  3832. /// \param __b16
  3833. ///    An 8-bit integral value used to initialize bits [135:128] of the result.
  3834. /// \param __b15
  3835. ///    An 8-bit integral value used to initialize bits [127:120] of the result.
  3836. /// \param __b14
  3837. ///    An 8-bit integral value used to initialize bits [119:112] of the result.
  3838. /// \param __b13
  3839. ///    An 8-bit integral value used to initialize bits [111:104] of the result.
  3840. /// \param __b12
  3841. ///    An 8-bit integral value used to initialize bits [103:96] of the result.
  3842. /// \param __b11
  3843. ///    An 8-bit integral value used to initialize bits [95:88] of the result.
  3844. /// \param __b10
  3845. ///    An 8-bit integral value used to initialize bits [87:80] of the result.
  3846. /// \param __b09
  3847. ///    An 8-bit integral value used to initialize bits [79:72] of the result.
  3848. /// \param __b08
  3849. ///    An 8-bit integral value used to initialize bits [71:64] of the result.
  3850. /// \param __b07
  3851. ///    An 8-bit integral value used to initialize bits [63:56] of the result.
  3852. /// \param __b06
  3853. ///    An 8-bit integral value used to initialize bits [55:48] of the result.
  3854. /// \param __b05
  3855. ///    An 8-bit integral value used to initialize bits [47:40] of the result.
  3856. /// \param __b04
  3857. ///    An 8-bit integral value used to initialize bits [39:32] of the result.
  3858. /// \param __b03
  3859. ///    An 8-bit integral value used to initialize bits [31:24] of the result.
  3860. /// \param __b02
  3861. ///    An 8-bit integral value used to initialize bits [23:16] of the result.
  3862. /// \param __b01
  3863. ///    An 8-bit integral value used to initialize bits [15:8] of the result.
  3864. /// \param __b00
  3865. ///    An 8-bit integral value used to initialize bits [7:0] of the result.
  3866. /// \returns An initialized 256-bit integer vector.
  3867. static __inline __m256i __DEFAULT_FN_ATTRS
  3868. _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
  3869.                 char __b27, char __b26, char __b25, char __b24,
  3870.                 char __b23, char __b22, char __b21, char __b20,
  3871.                 char __b19, char __b18, char __b17, char __b16,
  3872.                 char __b15, char __b14, char __b13, char __b12,
  3873.                 char __b11, char __b10, char __b09, char __b08,
  3874.                 char __b07, char __b06, char __b05, char __b04,
  3875.                 char __b03, char __b02, char __b01, char __b00)
  3876. {
  3877.   return __extension__ (__m256i)(__v32qi){
  3878.     __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
  3879.     __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
  3880.     __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
  3881.     __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
  3882.   };
  3883. }
  3884.  
  3885. /// Constructs a 256-bit integer vector initialized with the specified
  3886. ///    64-bit integral values, highest element first.
  3887. ///
  3888. /// \headerfile <x86intrin.h>
  3889. ///
  3890. /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
  3891. ///   instruction.
  3892. ///
  3893. /// \param __a
  3894. ///    A 64-bit integral value used to initialize bits [255:192] of the result.
  3895. /// \param __b
  3896. ///    A 64-bit integral value used to initialize bits [191:128] of the result.
  3897. /// \param __c
  3898. ///    A 64-bit integral value used to initialize bits [127:64] of the result.
  3899. /// \param __d
  3900. ///    A 64-bit integral value used to initialize bits [63:0] of the result.
  3901. /// \returns An initialized 256-bit integer vector.
  3902. static __inline __m256i __DEFAULT_FN_ATTRS
  3903. _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
  3904. {
  3905.   return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
  3906. }
  3907.  
  3908. /* Create vectors with elements in reverse order */
  3909. /// Constructs a 256-bit floating-point vector of [4 x double],
  3910. ///    initialized in reverse order (lowest element first) with the specified
  3911. ///    double-precision floating-point values.
  3912. ///
  3913. /// \headerfile <x86intrin.h>
  3914. ///
  3915. /// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
  3916. ///   instruction.
  3917. ///
  3918. /// \param __a
  3919. ///    A double-precision floating-point value used to initialize bits [63:0]
  3920. ///    of the result.
  3921. /// \param __b
  3922. ///    A double-precision floating-point value used to initialize bits [127:64]
  3923. ///    of the result.
  3924. /// \param __c
  3925. ///    A double-precision floating-point value used to initialize bits [191:128]
  3926. ///    of the result.
  3927. /// \param __d
  3928. ///    A double-precision floating-point value used to initialize bits [255:192]
  3929. ///    of the result.
  3930. /// \returns An initialized 256-bit floating-point vector of [4 x double].
  3931. static __inline __m256d __DEFAULT_FN_ATTRS
  3932. _mm256_setr_pd(double __a, double __b, double __c, double __d)
  3933. {
  3934.   return _mm256_set_pd(__d, __c, __b, __a);
  3935. }
  3936.  
  3937. /// Constructs a 256-bit floating-point vector of [8 x float],
  3938. ///    initialized in reverse order (lowest element first) with the specified
  3939. ///    single-precision floating-point values.
  3940. ///
  3941. /// \headerfile <x86intrin.h>
  3942. ///
  3943. /// This intrinsic is a utility function and does not correspond to a specific
  3944. ///   instruction.
  3945. ///
  3946. /// \param __a
  3947. ///    A single-precision floating-point value used to initialize bits [31:0]
  3948. ///    of the result.
  3949. /// \param __b
  3950. ///    A single-precision floating-point value used to initialize bits [63:32]
  3951. ///    of the result.
  3952. /// \param __c
  3953. ///    A single-precision floating-point value used to initialize bits [95:64]
  3954. ///    of the result.
  3955. /// \param __d
  3956. ///    A single-precision floating-point value used to initialize bits [127:96]
  3957. ///    of the result.
  3958. /// \param __e
  3959. ///    A single-precision floating-point value used to initialize bits [159:128]
  3960. ///    of the result.
  3961. /// \param __f
  3962. ///    A single-precision floating-point value used to initialize bits [191:160]
  3963. ///    of the result.
  3964. /// \param __g
  3965. ///    A single-precision floating-point value used to initialize bits [223:192]
  3966. ///    of the result.
  3967. /// \param __h
  3968. ///    A single-precision floating-point value used to initialize bits [255:224]
  3969. ///    of the result.
  3970. /// \returns An initialized 256-bit floating-point vector of [8 x float].
  3971. static __inline __m256 __DEFAULT_FN_ATTRS
  3972. _mm256_setr_ps(float __a, float __b, float __c, float __d,
  3973.                float __e, float __f, float __g, float __h)
  3974. {
  3975.   return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
  3976. }
  3977.  
  3978. /// Constructs a 256-bit integer vector, initialized in reverse order
  3979. ///    (lowest element first) with the specified 32-bit integral values.
  3980. ///
  3981. /// \headerfile <x86intrin.h>
  3982. ///
  3983. /// This intrinsic is a utility function and does not correspond to a specific
  3984. ///   instruction.
  3985. ///
  3986. /// \param __i0
  3987. ///    A 32-bit integral value used to initialize bits [31:0] of the result.
  3988. /// \param __i1
  3989. ///    A 32-bit integral value used to initialize bits [63:32] of the result.
  3990. /// \param __i2
  3991. ///    A 32-bit integral value used to initialize bits [95:64] of the result.
  3992. /// \param __i3
  3993. ///    A 32-bit integral value used to initialize bits [127:96] of the result.
  3994. /// \param __i4
  3995. ///    A 32-bit integral value used to initialize bits [159:128] of the result.
  3996. /// \param __i5
  3997. ///    A 32-bit integral value used to initialize bits [191:160] of the result.
  3998. /// \param __i6
  3999. ///    A 32-bit integral value used to initialize bits [223:192] of the result.
  4000. /// \param __i7
  4001. ///    A 32-bit integral value used to initialize bits [255:224] of the result.
  4002. /// \returns An initialized 256-bit integer vector.
  4003. static __inline __m256i __DEFAULT_FN_ATTRS
  4004. _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
  4005.                   int __i4, int __i5, int __i6, int __i7)
  4006. {
  4007.   return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
  4008. }
  4009.  
  4010. /// Constructs a 256-bit integer vector, initialized in reverse order
  4011. ///    (lowest element first) with the specified 16-bit integral values.
  4012. ///
  4013. /// \headerfile <x86intrin.h>
  4014. ///
  4015. /// This intrinsic is a utility function and does not correspond to a specific
  4016. ///   instruction.
  4017. ///
  4018. /// \param __w15
  4019. ///    A 16-bit integral value used to initialize bits [15:0] of the result.
  4020. /// \param __w14
  4021. ///    A 16-bit integral value used to initialize bits [31:16] of the result.
  4022. /// \param __w13
  4023. ///    A 16-bit integral value used to initialize bits [47:32] of the result.
  4024. /// \param __w12
  4025. ///    A 16-bit integral value used to initialize bits [63:48] of the result.
  4026. /// \param __w11
  4027. ///    A 16-bit integral value used to initialize bits [79:64] of the result.
  4028. /// \param __w10
  4029. ///    A 16-bit integral value used to initialize bits [95:80] of the result.
  4030. /// \param __w09
  4031. ///    A 16-bit integral value used to initialize bits [111:96] of the result.
  4032. /// \param __w08
  4033. ///    A 16-bit integral value used to initialize bits [127:112] of the result.
  4034. /// \param __w07
  4035. ///    A 16-bit integral value used to initialize bits [143:128] of the result.
  4036. /// \param __w06
  4037. ///    A 16-bit integral value used to initialize bits [159:144] of the result.
  4038. /// \param __w05
  4039. ///    A 16-bit integral value used to initialize bits [175:160] of the result.
  4040. /// \param __w04
  4041. ///    A 16-bit integral value used to initialize bits [191:176] of the result.
  4042. /// \param __w03
  4043. ///    A 16-bit integral value used to initialize bits [207:192] of the result.
  4044. /// \param __w02
  4045. ///    A 16-bit integral value used to initialize bits [223:208] of the result.
  4046. /// \param __w01
  4047. ///    A 16-bit integral value used to initialize bits [239:224] of the result.
  4048. /// \param __w00
  4049. ///    A 16-bit integral value used to initialize bits [255:240] of the result.
  4050. /// \returns An initialized 256-bit integer vector.
  4051. static __inline __m256i __DEFAULT_FN_ATTRS
  4052. _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
  4053.        short __w11, short __w10, short __w09, short __w08,
  4054.        short __w07, short __w06, short __w05, short __w04,
  4055.        short __w03, short __w02, short __w01, short __w00)
  4056. {
  4057.   return _mm256_set_epi16(__w00, __w01, __w02, __w03,
  4058.                           __w04, __w05, __w06, __w07,
  4059.                           __w08, __w09, __w10, __w11,
  4060.                           __w12, __w13, __w14, __w15);
  4061. }
  4062.  
  4063. /// Constructs a 256-bit integer vector, initialized in reverse order
  4064. ///    with the specified 8-bit integral values.
  4065. ///
  4066. /// \headerfile <x86intrin.h>
  4067. ///
  4068. /// This intrinsic is a utility function and does not correspond to a specific
  4069. ///   instruction.
  4070. ///
  4071. /// \param __b31
  4072. ///    An 8-bit integral value used to initialize bits [7:0] of the result.
  4073. /// \param __b30
  4074. ///    An 8-bit integral value used to initialize bits [15:8] of the result.
  4075. /// \param __b29
  4076. ///    An 8-bit integral value used to initialize bits [23:16] of the result.
  4077. /// \param __b28
  4078. ///    An 8-bit integral value used to initialize bits [31:24] of the result.
  4079. /// \param __b27
  4080. ///    An 8-bit integral value used to initialize bits [39:32] of the result.
  4081. /// \param __b26
  4082. ///    An 8-bit integral value used to initialize bits [47:40] of the result.
  4083. /// \param __b25
  4084. ///    An 8-bit integral value used to initialize bits [55:48] of the result.
  4085. /// \param __b24
  4086. ///    An 8-bit integral value used to initialize bits [63:56] of the result.
  4087. /// \param __b23
  4088. ///    An 8-bit integral value used to initialize bits [71:64] of the result.
  4089. /// \param __b22
  4090. ///    An 8-bit integral value used to initialize bits [79:72] of the result.
  4091. /// \param __b21
  4092. ///    An 8-bit integral value used to initialize bits [87:80] of the result.
  4093. /// \param __b20
  4094. ///    An 8-bit integral value used to initialize bits [95:88] of the result.
  4095. /// \param __b19
  4096. ///    An 8-bit integral value used to initialize bits [103:96] of the result.
  4097. /// \param __b18
  4098. ///    An 8-bit integral value used to initialize bits [111:104] of the result.
  4099. /// \param __b17
  4100. ///    An 8-bit integral value used to initialize bits [119:112] of the result.
  4101. /// \param __b16
  4102. ///    An 8-bit integral value used to initialize bits [127:120] of the result.
  4103. /// \param __b15
  4104. ///    An 8-bit integral value used to initialize bits [135:128] of the result.
  4105. /// \param __b14
  4106. ///    An 8-bit integral value used to initialize bits [143:136] of the result.
  4107. /// \param __b13
  4108. ///    An 8-bit integral value used to initialize bits [151:144] of the result.
  4109. /// \param __b12
  4110. ///    An 8-bit integral value used to initialize bits [159:152] of the result.
  4111. /// \param __b11
  4112. ///    An 8-bit integral value used to initialize bits [167:160] of the result.
  4113. /// \param __b10
  4114. ///    An 8-bit integral value used to initialize bits [175:168] of the result.
  4115. /// \param __b09
  4116. ///    An 8-bit integral value used to initialize bits [183:176] of the result.
  4117. /// \param __b08
  4118. ///    An 8-bit integral value used to initialize bits [191:184] of the result.
  4119. /// \param __b07
  4120. ///    An 8-bit integral value used to initialize bits [199:192] of the result.
  4121. /// \param __b06
  4122. ///    An 8-bit integral value used to initialize bits [207:200] of the result.
  4123. /// \param __b05
  4124. ///    An 8-bit integral value used to initialize bits [215:208] of the result.
  4125. /// \param __b04
  4126. ///    An 8-bit integral value used to initialize bits [223:216] of the result.
  4127. /// \param __b03
  4128. ///    An 8-bit integral value used to initialize bits [231:224] of the result.
  4129. /// \param __b02
  4130. ///    An 8-bit integral value used to initialize bits [239:232] of the result.
  4131. /// \param __b01
  4132. ///    An 8-bit integral value used to initialize bits [247:240] of the result.
  4133. /// \param __b00
  4134. ///    An 8-bit integral value used to initialize bits [255:248] of the result.
  4135. /// \returns An initialized 256-bit integer vector.
  4136. static __inline __m256i __DEFAULT_FN_ATTRS
  4137. _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
  4138.                  char __b27, char __b26, char __b25, char __b24,
  4139.                  char __b23, char __b22, char __b21, char __b20,
  4140.                  char __b19, char __b18, char __b17, char __b16,
  4141.                  char __b15, char __b14, char __b13, char __b12,
  4142.                  char __b11, char __b10, char __b09, char __b08,
  4143.                  char __b07, char __b06, char __b05, char __b04,
  4144.                  char __b03, char __b02, char __b01, char __b00)
  4145. {
  4146.   return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
  4147.                          __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
  4148.                          __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
  4149.                          __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
  4150. }
  4151.  
  4152. /// Constructs a 256-bit integer vector, initialized in reverse order
  4153. ///    with the specified 64-bit integral values.
  4154. ///
  4155. /// \headerfile <x86intrin.h>
  4156. ///
  4157. /// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
  4158. ///   instruction.
  4159. ///
  4160. /// \param __a
  4161. ///    A 64-bit integral value used to initialize bits [63:0] of the result.
  4162. /// \param __b
  4163. ///    A 64-bit integral value used to initialize bits [127:64] of the result.
  4164. /// \param __c
  4165. ///    A 64-bit integral value used to initialize bits [191:128] of the result.
  4166. /// \param __d
  4167. ///    A 64-bit integral value used to initialize bits [255:192] of the result.
  4168. /// \returns An initialized 256-bit integer vector.
  4169. static __inline __m256i __DEFAULT_FN_ATTRS
  4170. _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
  4171. {
  4172.   return _mm256_set_epi64x(__d, __c, __b, __a);
  4173. }
  4174.  
  4175. /* Create vectors with repeated elements */
  4176. /// Constructs a 256-bit floating-point vector of [4 x double], with each
  4177. ///    of the four double-precision floating-point vector elements set to the
  4178. ///    specified double-precision floating-point value.
  4179. ///
  4180. /// \headerfile <x86intrin.h>
  4181. ///
  4182. /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
  4183. ///
  4184. /// \param __w
  4185. ///    A double-precision floating-point value used to initialize each vector
  4186. ///    element of the result.
  4187. /// \returns An initialized 256-bit floating-point vector of [4 x double].
  4188. static __inline __m256d __DEFAULT_FN_ATTRS
  4189. _mm256_set1_pd(double __w)
  4190. {
  4191.   return _mm256_set_pd(__w, __w, __w, __w);
  4192. }
  4193.  
  4194. /// Constructs a 256-bit floating-point vector of [8 x float], with each
  4195. ///    of the eight single-precision floating-point vector elements set to the
  4196. ///    specified single-precision floating-point value.
  4197. ///
  4198. /// \headerfile <x86intrin.h>
  4199. ///
  4200. /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
  4201. ///   instruction.
  4202. ///
  4203. /// \param __w
  4204. ///    A single-precision floating-point value used to initialize each vector
  4205. ///    element of the result.
  4206. /// \returns An initialized 256-bit floating-point vector of [8 x float].
  4207. static __inline __m256 __DEFAULT_FN_ATTRS
  4208. _mm256_set1_ps(float __w)
  4209. {
  4210.   return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
  4211. }
  4212.  
  4213. /// Constructs a 256-bit integer vector of [8 x i32], with each of the
  4214. ///    32-bit integral vector elements set to the specified 32-bit integral
  4215. ///    value.
  4216. ///
  4217. /// \headerfile <x86intrin.h>
  4218. ///
  4219. /// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
  4220. ///   instruction.
  4221. ///
  4222. /// \param __i
  4223. ///    A 32-bit integral value used to initialize each vector element of the
  4224. ///    result.
  4225. /// \returns An initialized 256-bit integer vector of [8 x i32].
  4226. static __inline __m256i __DEFAULT_FN_ATTRS
  4227. _mm256_set1_epi32(int __i)
  4228. {
  4229.   return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
  4230. }
  4231.  
  4232. /// Constructs a 256-bit integer vector of [16 x i16], with each of the
  4233. ///    16-bit integral vector elements set to the specified 16-bit integral
  4234. ///    value.
  4235. ///
  4236. /// \headerfile <x86intrin.h>
  4237. ///
  4238. /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
  4239. ///
  4240. /// \param __w
  4241. ///    A 16-bit integral value used to initialize each vector element of the
  4242. ///    result.
  4243. /// \returns An initialized 256-bit integer vector of [16 x i16].
  4244. static __inline __m256i __DEFAULT_FN_ATTRS
  4245. _mm256_set1_epi16(short __w)
  4246. {
  4247.   return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
  4248.                           __w, __w, __w, __w, __w, __w, __w, __w);
  4249. }
  4250.  
  4251. /// Constructs a 256-bit integer vector of [32 x i8], with each of the
  4252. ///    8-bit integral vector elements set to the specified 8-bit integral value.
  4253. ///
  4254. /// \headerfile <x86intrin.h>
  4255. ///
  4256. /// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
  4257. ///
  4258. /// \param __b
  4259. ///    An 8-bit integral value used to initialize each vector element of the
  4260. ///    result.
  4261. /// \returns An initialized 256-bit integer vector of [32 x i8].
  4262. static __inline __m256i __DEFAULT_FN_ATTRS
  4263. _mm256_set1_epi8(char __b)
  4264. {
  4265.   return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
  4266.                          __b, __b, __b, __b, __b, __b, __b, __b,
  4267.                          __b, __b, __b, __b, __b, __b, __b, __b,
  4268.                          __b, __b, __b, __b, __b, __b, __b, __b);
  4269. }
  4270.  
  4271. /// Constructs a 256-bit integer vector of [4 x i64], with each of the
  4272. ///    64-bit integral vector elements set to the specified 64-bit integral
  4273. ///    value.
  4274. ///
  4275. /// \headerfile <x86intrin.h>
  4276. ///
  4277. /// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
  4278. ///
  4279. /// \param __q
  4280. ///    A 64-bit integral value used to initialize each vector element of the
  4281. ///    result.
  4282. /// \returns An initialized 256-bit integer vector of [4 x i64].
  4283. static __inline __m256i __DEFAULT_FN_ATTRS
  4284. _mm256_set1_epi64x(long long __q)
  4285. {
  4286.   return _mm256_set_epi64x(__q, __q, __q, __q);
  4287. }
  4288.  
  4289. /* Create zeroed vectors */
  4290. /// Constructs a 256-bit floating-point vector of [4 x double] with all
  4291. ///    vector elements initialized to zero.
  4292. ///
  4293. /// \headerfile <x86intrin.h>
  4294. ///
  4295. /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
  4296. ///
  4297. /// \returns A 256-bit vector of [4 x double] with all elements set to zero.
  4298. static __inline __m256d __DEFAULT_FN_ATTRS
  4299. _mm256_setzero_pd(void)
  4300. {
  4301.   return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
  4302. }
  4303.  
  4304. /// Constructs a 256-bit floating-point vector of [8 x float] with all
  4305. ///    vector elements initialized to zero.
  4306. ///
  4307. /// \headerfile <x86intrin.h>
  4308. ///
  4309. /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
  4310. ///
  4311. /// \returns A 256-bit vector of [8 x float] with all elements set to zero.
  4312. static __inline __m256 __DEFAULT_FN_ATTRS
  4313. _mm256_setzero_ps(void)
  4314. {
  4315.   return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
  4316. }
  4317.  
  4318. /// Constructs a 256-bit integer vector initialized to zero.
  4319. ///
  4320. /// \headerfile <x86intrin.h>
  4321. ///
  4322. /// This intrinsic corresponds to the <c> VXORPS </c> instruction.
  4323. ///
  4324. /// \returns A 256-bit integer vector initialized to zero.
  4325. static __inline __m256i __DEFAULT_FN_ATTRS
  4326. _mm256_setzero_si256(void)
  4327. {
  4328.   return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
  4329. }
  4330.  
  4331. /* Cast between vector types */
  4332. /// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
  4333. ///    floating-point vector of [8 x float].
  4334. ///
  4335. /// \headerfile <x86intrin.h>
  4336. ///
  4337. /// This intrinsic has no corresponding instruction.
  4338. ///
  4339. /// \param __a
  4340. ///    A 256-bit floating-point vector of [4 x double].
  4341. /// \returns A 256-bit floating-point vector of [8 x float] containing the same
  4342. ///    bitwise pattern as the parameter.
  4343. static __inline __m256 __DEFAULT_FN_ATTRS
  4344. _mm256_castpd_ps(__m256d __a)
  4345. {
  4346.   return (__m256)__a;
  4347. }
  4348.  
  4349. /// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
  4350. ///    integer vector.
  4351. ///
  4352. /// \headerfile <x86intrin.h>
  4353. ///
  4354. /// This intrinsic has no corresponding instruction.
  4355. ///
  4356. /// \param __a
  4357. ///    A 256-bit floating-point vector of [4 x double].
  4358. /// \returns A 256-bit integer vector containing the same bitwise pattern as the
  4359. ///    parameter.
  4360. static __inline __m256i __DEFAULT_FN_ATTRS
  4361. _mm256_castpd_si256(__m256d __a)
  4362. {
  4363.   return (__m256i)__a;
  4364. }
  4365.  
  4366. /// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
  4367. ///    floating-point vector of [4 x double].
  4368. ///
  4369. /// \headerfile <x86intrin.h>
  4370. ///
  4371. /// This intrinsic has no corresponding instruction.
  4372. ///
  4373. /// \param __a
  4374. ///    A 256-bit floating-point vector of [8 x float].
  4375. /// \returns A 256-bit floating-point vector of [4 x double] containing the same
  4376. ///    bitwise pattern as the parameter.
  4377. static __inline __m256d __DEFAULT_FN_ATTRS
  4378. _mm256_castps_pd(__m256 __a)
  4379. {
  4380.   return (__m256d)__a;
  4381. }
  4382.  
  4383. /// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
  4384. ///    integer vector.
  4385. ///
  4386. /// \headerfile <x86intrin.h>
  4387. ///
  4388. /// This intrinsic has no corresponding instruction.
  4389. ///
  4390. /// \param __a
  4391. ///    A 256-bit floating-point vector of [8 x float].
  4392. /// \returns A 256-bit integer vector containing the same bitwise pattern as the
  4393. ///    parameter.
  4394. static __inline __m256i __DEFAULT_FN_ATTRS
  4395. _mm256_castps_si256(__m256 __a)
  4396. {
  4397.   return (__m256i)__a;
  4398. }
  4399.  
  4400. /// Casts a 256-bit integer vector into a 256-bit floating-point vector
  4401. ///    of [8 x float].
  4402. ///
  4403. /// \headerfile <x86intrin.h>
  4404. ///
  4405. /// This intrinsic has no corresponding instruction.
  4406. ///
  4407. /// \param __a
  4408. ///    A 256-bit integer vector.
  4409. /// \returns A 256-bit floating-point vector of [8 x float] containing the same
  4410. ///    bitwise pattern as the parameter.
  4411. static __inline __m256 __DEFAULT_FN_ATTRS
  4412. _mm256_castsi256_ps(__m256i __a)
  4413. {
  4414.   return (__m256)__a;
  4415. }
  4416.  
  4417. /// Casts a 256-bit integer vector into a 256-bit floating-point vector
  4418. ///    of [4 x double].
  4419. ///
  4420. /// \headerfile <x86intrin.h>
  4421. ///
  4422. /// This intrinsic has no corresponding instruction.
  4423. ///
  4424. /// \param __a
  4425. ///    A 256-bit integer vector.
  4426. /// \returns A 256-bit floating-point vector of [4 x double] containing the same
  4427. ///    bitwise pattern as the parameter.
  4428. static __inline __m256d __DEFAULT_FN_ATTRS
  4429. _mm256_castsi256_pd(__m256i __a)
  4430. {
  4431.   return (__m256d)__a;
  4432. }
  4433.  
  4434. /// Returns the lower 128 bits of a 256-bit floating-point vector of
  4435. ///    [4 x double] as a 128-bit floating-point vector of [2 x double].
  4436. ///
  4437. /// \headerfile <x86intrin.h>
  4438. ///
  4439. /// This intrinsic has no corresponding instruction.
  4440. ///
  4441. /// \param __a
  4442. ///    A 256-bit floating-point vector of [4 x double].
  4443. /// \returns A 128-bit floating-point vector of [2 x double] containing the
  4444. ///    lower 128 bits of the parameter.
  4445. static __inline __m128d __DEFAULT_FN_ATTRS
  4446. _mm256_castpd256_pd128(__m256d __a)
  4447. {
  4448.   return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
  4449. }
  4450.  
  4451. /// Returns the lower 128 bits of a 256-bit floating-point vector of
  4452. ///    [8 x float] as a 128-bit floating-point vector of [4 x float].
  4453. ///
  4454. /// \headerfile <x86intrin.h>
  4455. ///
  4456. /// This intrinsic has no corresponding instruction.
  4457. ///
  4458. /// \param __a
  4459. ///    A 256-bit floating-point vector of [8 x float].
  4460. /// \returns A 128-bit floating-point vector of [4 x float] containing the
  4461. ///    lower 128 bits of the parameter.
  4462. static __inline __m128 __DEFAULT_FN_ATTRS
  4463. _mm256_castps256_ps128(__m256 __a)
  4464. {
  4465.   return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
  4466. }
  4467.  
  4468. /// Truncates a 256-bit integer vector into a 128-bit integer vector.
  4469. ///
  4470. /// \headerfile <x86intrin.h>
  4471. ///
  4472. /// This intrinsic has no corresponding instruction.
  4473. ///
  4474. /// \param __a
  4475. ///    A 256-bit integer vector.
  4476. /// \returns A 128-bit integer vector containing the lower 128 bits of the
  4477. ///    parameter.
  4478. static __inline __m128i __DEFAULT_FN_ATTRS
  4479. _mm256_castsi256_si128(__m256i __a)
  4480. {
  4481.   return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
  4482. }
  4483.  
  4484. /// Constructs a 256-bit floating-point vector of [4 x double] from a
  4485. ///    128-bit floating-point vector of [2 x double].
  4486. ///
  4487. ///    The lower 128 bits contain the value of the source vector. The contents
  4488. ///    of the upper 128 bits are undefined.
  4489. ///
  4490. /// \headerfile <x86intrin.h>
  4491. ///
  4492. /// This intrinsic has no corresponding instruction.
  4493. ///
  4494. /// \param __a
  4495. ///    A 128-bit vector of [2 x double].
  4496. /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
  4497. ///    contain the value of the parameter. The contents of the upper 128 bits
  4498. ///    are undefined.
  4499. static __inline __m256d __DEFAULT_FN_ATTRS
  4500. _mm256_castpd128_pd256(__m128d __a)
  4501. {
  4502.   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
  4503. }
  4504.  
  4505. /// Constructs a 256-bit floating-point vector of [8 x float] from a
  4506. ///    128-bit floating-point vector of [4 x float].
  4507. ///
  4508. ///    The lower 128 bits contain the value of the source vector. The contents
  4509. ///    of the upper 128 bits are undefined.
  4510. ///
  4511. /// \headerfile <x86intrin.h>
  4512. ///
  4513. /// This intrinsic has no corresponding instruction.
  4514. ///
  4515. /// \param __a
  4516. ///    A 128-bit vector of [4 x float].
  4517. /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
  4518. ///    contain the value of the parameter. The contents of the upper 128 bits
  4519. ///    are undefined.
  4520. static __inline __m256 __DEFAULT_FN_ATTRS
  4521. _mm256_castps128_ps256(__m128 __a)
  4522. {
  4523.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
  4524. }
  4525.  
  4526. /// Constructs a 256-bit integer vector from a 128-bit integer vector.
  4527. ///
  4528. ///    The lower 128 bits contain the value of the source vector. The contents
  4529. ///    of the upper 128 bits are undefined.
  4530. ///
  4531. /// \headerfile <x86intrin.h>
  4532. ///
  4533. /// This intrinsic has no corresponding instruction.
  4534. ///
  4535. /// \param __a
  4536. ///    A 128-bit integer vector.
  4537. /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
  4538. ///    the parameter. The contents of the upper 128 bits are undefined.
  4539. static __inline __m256i __DEFAULT_FN_ATTRS
  4540. _mm256_castsi128_si256(__m128i __a)
  4541. {
  4542.   return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
  4543. }
  4544.  
  4545. /// Constructs a 256-bit floating-point vector of [4 x double] from a
  4546. ///    128-bit floating-point vector of [2 x double]. The lower 128 bits
  4547. ///    contain the value of the source vector. The upper 128 bits are set
  4548. ///    to zero.
  4549. ///
  4550. /// \headerfile <x86intrin.h>
  4551. ///
  4552. /// This intrinsic has no corresponding instruction.
  4553. ///
  4554. /// \param __a
  4555. ///    A 128-bit vector of [2 x double].
  4556. /// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
  4557. ///    contain the value of the parameter. The upper 128 bits are set to zero.
  4558. static __inline __m256d __DEFAULT_FN_ATTRS
  4559. _mm256_zextpd128_pd256(__m128d __a)
  4560. {
  4561.   return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
  4562. }
  4563.  
  4564. /// Constructs a 256-bit floating-point vector of [8 x float] from a
  4565. ///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
  4566. ///    the value of the source vector. The upper 128 bits are set to zero.
  4567. ///
  4568. /// \headerfile <x86intrin.h>
  4569. ///
  4570. /// This intrinsic has no corresponding instruction.
  4571. ///
  4572. /// \param __a
  4573. ///    A 128-bit vector of [4 x float].
  4574. /// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
  4575. ///    contain the value of the parameter. The upper 128 bits are set to zero.
  4576. static __inline __m256 __DEFAULT_FN_ATTRS
  4577. _mm256_zextps128_ps256(__m128 __a)
  4578. {
  4579.   return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
  4580. }
  4581.  
  4582. /// Constructs a 256-bit integer vector from a 128-bit integer vector.
  4583. ///    The lower 128 bits contain the value of the source vector. The upper
  4584. ///    128 bits are set to zero.
  4585. ///
  4586. /// \headerfile <x86intrin.h>
  4587. ///
  4588. /// This intrinsic has no corresponding instruction.
  4589. ///
  4590. /// \param __a
  4591. ///    A 128-bit integer vector.
  4592. /// \returns A 256-bit integer vector. The lower 128 bits contain the value of
  4593. ///    the parameter. The upper 128 bits are set to zero.
  4594. static __inline __m256i __DEFAULT_FN_ATTRS
  4595. _mm256_zextsi128_si256(__m128i __a)
  4596. {
  4597.   return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
  4598. }
  4599.  
  4600. /*
  4601.    Vector insert.
  4602.    We use macros rather than inlines because we only want to accept
  4603.    invocations where the immediate M is a constant expression.
  4604. */
  4605. /// Constructs a new 256-bit vector of [8 x float] by first duplicating
  4606. ///    a 256-bit vector of [8 x float] given in the first parameter, and then
  4607. ///    replacing either the upper or the lower 128 bits with the contents of a
  4608. ///    128-bit vector of [4 x float] in the second parameter.
  4609. ///
  4610. ///    The immediate integer parameter selects whether the upper or the lower
  4611. ///    128 bits are replaced.
  4612. ///
  4613. /// \headerfile <x86intrin.h>
  4614. ///
  4615. /// \code
  4616. /// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
  4617. /// \endcode
  4618. ///
  4619. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4620. ///
  4621. /// \param V1
  4622. ///    A 256-bit vector of [8 x float]. This vector is copied to the result
  4623. ///    first, and then either the upper or the lower 128 bits of the result will
  4624. ///    be replaced by the contents of \a V2.
  4625. /// \param V2
  4626. ///    A 128-bit vector of [4 x float]. The contents of this parameter are
  4627. ///    written to either the upper or the lower 128 bits of the result depending
  4628. ///    on the value of parameter \a M.
  4629. /// \param M
  4630. ///    An immediate integer. The least significant bit determines how the values
  4631. ///    from the two parameters are interleaved: \n
  4632. ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
  4633. ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
  4634. ///    result. \n
  4635. ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
  4636. ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
  4637. ///    result.
  4638. /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
  4639. #define _mm256_insertf128_ps(V1, V2, M) \
  4640.   ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
  4641.                                             (__v4sf)(__m128)(V2), (int)(M)))
  4642.  
  4643. /// Constructs a new 256-bit vector of [4 x double] by first duplicating
  4644. ///    a 256-bit vector of [4 x double] given in the first parameter, and then
  4645. ///    replacing either the upper or the lower 128 bits with the contents of a
  4646. ///    128-bit vector of [2 x double] in the second parameter.
  4647. ///
  4648. ///    The immediate integer parameter selects whether the upper or the lower
  4649. ///    128 bits are replaced.
  4650. ///
  4651. /// \headerfile <x86intrin.h>
  4652. ///
  4653. /// \code
  4654. /// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
  4655. /// \endcode
  4656. ///
  4657. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4658. ///
  4659. /// \param V1
  4660. ///    A 256-bit vector of [4 x double]. This vector is copied to the result
  4661. ///    first, and then either the upper or the lower 128 bits of the result will
  4662. ///    be replaced by the contents of \a V2.
  4663. /// \param V2
  4664. ///    A 128-bit vector of [2 x double]. The contents of this parameter are
  4665. ///    written to either the upper or the lower 128 bits of the result depending
  4666. ///    on the value of parameter \a M.
  4667. /// \param M
  4668. ///    An immediate integer. The least significant bit determines how the values
  4669. ///    from the two parameters are interleaved: \n
  4670. ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
  4671. ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
  4672. ///    result. \n
  4673. ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
  4674. ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
  4675. ///    result.
  4676. /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
  4677. #define _mm256_insertf128_pd(V1, V2, M) \
  4678.   ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
  4679.                                              (__v2df)(__m128d)(V2), (int)(M)))
  4680.  
  4681. /// Constructs a new 256-bit integer vector by first duplicating a
  4682. ///    256-bit integer vector given in the first parameter, and then replacing
  4683. ///    either the upper or the lower 128 bits with the contents of a 128-bit
  4684. ///    integer vector in the second parameter.
  4685. ///
  4686. ///    The immediate integer parameter selects whether the upper or the lower
  4687. ///    128 bits are replaced.
  4688. ///
  4689. /// \headerfile <x86intrin.h>
  4690. ///
  4691. /// \code
  4692. /// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
  4693. /// \endcode
  4694. ///
  4695. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4696. ///
  4697. /// \param V1
  4698. ///    A 256-bit integer vector. This vector is copied to the result first, and
  4699. ///    then either the upper or the lower 128 bits of the result will be
  4700. ///    replaced by the contents of \a V2.
  4701. /// \param V2
  4702. ///    A 128-bit integer vector. The contents of this parameter are written to
  4703. ///    either the upper or the lower 128 bits of the result depending on the
  4704. ///    value of parameter \a M.
  4705. /// \param M
  4706. ///    An immediate integer. The least significant bit determines how the values
  4707. ///    from the two parameters are interleaved: \n
  4708. ///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
  4709. ///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
  4710. ///    result. \n
  4711. ///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
  4712. ///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
  4713. ///    result.
  4714. /// \returns A 256-bit integer vector containing the interleaved values.
  4715. #define _mm256_insertf128_si256(V1, V2, M) \
  4716.   ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
  4717.                                              (__v4si)(__m128i)(V2), (int)(M)))
  4718.  
  4719. /*
  4720.    Vector extract.
  4721.    We use macros rather than inlines because we only want to accept
  4722.    invocations where the immediate M is a constant expression.
  4723. */
  4724. /// Extracts either the upper or the lower 128 bits from a 256-bit vector
  4725. ///    of [8 x float], as determined by the immediate integer parameter, and
  4726. ///    returns the extracted bits as a 128-bit vector of [4 x float].
  4727. ///
  4728. /// \headerfile <x86intrin.h>
  4729. ///
  4730. /// \code
  4731. /// __m128 _mm256_extractf128_ps(__m256 V, const int M);
  4732. /// \endcode
  4733. ///
  4734. /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
  4735. ///
  4736. /// \param V
  4737. ///    A 256-bit vector of [8 x float].
  4738. /// \param M
  4739. ///    An immediate integer. The least significant bit determines which bits are
  4740. ///    extracted from the first parameter: \n
  4741. ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
  4742. ///    result. \n
  4743. ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
  4744. /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
  4745. #define _mm256_extractf128_ps(V, M) \
  4746.   ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
  4747.  
  4748. /// Extracts either the upper or the lower 128 bits of a 256-bit vector
  4749. ///    of [4 x double], as selected by the immediate integer parameter, and
  4750. ///    returns the extracted bits as a 128-bit vector of [2 x double].
  4751. ///
  4752. /// \headerfile <x86intrin.h>
  4753. ///
  4754. /// \code
  4755. /// __m128d _mm256_extractf128_pd(__m256d V, const int M);
  4756. /// \endcode
  4757. ///
  4758. /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
  4759. ///
  4760. /// \param V
  4761. ///    The 256-bit source vector of [4 x double].
  4762. /// \param M
  4763. ///    An immediate integer (the macro casts it to int). Its least significant
  4764. ///    bit selects which 128-bit half of \a V is extracted: \n
  4765. ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
  4766. ///    result. \n
  4767. ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
  4768. /// \returns A 128-bit vector of [2 x double] holding the selected half of \a V.
  4769. #define _mm256_extractf128_pd(V, M) \
  4770.   ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
  4771.  
  4772. /// Extracts either the upper or the lower 128 bits of a 256-bit
  4773. ///    integer vector, as selected by the immediate integer parameter, and
  4774. ///    returns the extracted bits as a 128-bit integer vector.
  4775. ///
  4776. /// \headerfile <x86intrin.h>
  4777. ///
  4778. /// \code
  4779. /// __m128i _mm256_extractf128_si256(__m256i V, const int M);
  4780. /// \endcode
  4781. ///
  4782. /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
  4783. ///
  4784. /// \param V
  4785. ///    The 256-bit source integer vector.
  4786. /// \param M
  4787. ///    An immediate integer (the macro casts it to int). Its least significant
  4788. ///    bit selects which 128-bit half of \a V is extracted: \n
  4789. ///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
  4790. ///    result. \n
  4791. ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
  4792. /// \returns A 128-bit integer vector holding the selected half of \a V.
  4793. #define _mm256_extractf128_si256(V, M) \
  4794.   ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
  4795.  
  4796. /// Builds a 256-bit vector of [8 x float] by joining two 128-bit vectors
  4797. ///    of [4 x float], given in high-then-low order.
  4798. ///
  4799. /// \headerfile <x86intrin.h>
  4800. ///
  4801. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4802. ///
  4803. /// \param __hi
  4804. ///    A 128-bit vector of [4 x float] placed in bits [255:128] of the result.
  4805. /// \param __lo
  4806. ///    A 128-bit vector of [4 x float] placed in bits [127:0] of the result.
  4807. /// \returns A 256-bit vector of [8 x float] holding the joined halves.
  4808. static __inline __m256 __DEFAULT_FN_ATTRS
  4809. _mm256_set_m128 (__m128 __hi, __m128 __lo)
  4810. {
  4811.   /* Shuffle indices 0-3 take from the first operand, 4-7 from the second. */
  4812.   __v4sf __lower = (__v4sf)__lo;
  4813.   __v4sf __upper = (__v4sf)__hi;
  4814.   return (__m256) __builtin_shufflevector(__lower, __upper, 0, 1, 2, 3, 4, 5, 6, 7);
  4815. }
  4816.  
  4817. /// Builds a 256-bit vector of [4 x double] by joining two 128-bit vectors
  4818. ///    of [2 x double], given in high-then-low order.
  4819. ///
  4820. /// \headerfile <x86intrin.h>
  4821. ///
  4822. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4823. ///
  4824. /// \param __hi
  4825. ///    A 128-bit vector of [2 x double] placed in bits [255:128] of the result.
  4826. /// \param __lo
  4827. ///    A 128-bit vector of [2 x double] placed in bits [127:0] of the result.
  4828. /// \returns A 256-bit vector of [4 x double] holding the joined halves.
  4829. static __inline __m256d __DEFAULT_FN_ATTRS
  4830. _mm256_set_m128d (__m128d __hi, __m128d __lo)
  4831. {
  4832.   /* Shuffle indices 0-1 take from the first operand, 2-3 from the second. */
  4833.   __v2df __lower = (__v2df)__lo;
  4834.   __v2df __upper = (__v2df)__hi;
  4835.   return (__m256d) __builtin_shufflevector(__lower, __upper, 0, 1, 2, 3);
  4836. }
  4837.  
  4838. /// Joins two 128-bit integer vectors (high half first) into a 256-bit vector.
  4839. ///
  4840. /// \headerfile <x86intrin.h>
  4841. ///
  4842. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4843. ///
  4844. /// \param __hi
  4845. ///    A 128-bit integer vector placed in bits [255:128] of the result.
  4846. /// \param __lo
  4847. ///    A 128-bit integer vector placed in bits [127:0] of the result.
  4848. /// \returns A 256-bit integer vector holding the joined halves.
  4849. static __inline __m256i __DEFAULT_FN_ATTRS
  4850. _mm256_set_m128i (__m128i __hi, __m128i __lo)
  4851. {
  4852.   /* Shuffle indices 0-1 take from the first operand, 2-3 from the second. */
  4853.   __v2di __lower = (__v2di)__lo;
  4854.   __v2di __upper = (__v2di)__hi;
  4855.   return (__m256i) __builtin_shufflevector(__lower, __upper, 0, 1, 2, 3);
  4856. }
  4857.  
  4858. /// Builds a 256-bit vector of [8 x float] by joining two 128-bit vectors
  4859. ///    of [4 x float]. Equivalent to _mm256_set_m128 with the two arguments
  4860. ///    given in the opposite (low-then-high) order.
  4861. ///
  4862. /// \headerfile <x86intrin.h>
  4863. ///
  4864. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4865. ///
  4866. /// \param __lo
  4867. ///    A 128-bit vector of [4 x float] placed in the lower 128 bits of the
  4868. ///    result.
  4869. /// \param __hi
  4870. ///    A 128-bit vector of [4 x float] placed in the upper 128 bits of the
  4871. ///    result.
  4872. /// \returns A 256-bit vector of [8 x float] holding the joined halves.
  4873. static __inline __m256 __DEFAULT_FN_ATTRS
  4874. _mm256_setr_m128 (__m128 __lo, __m128 __hi)
  4875. {
  4876.   /* Delegate with the arguments swapped back into high-then-low order. */
  4877.   __m256 __joined = _mm256_set_m128(__hi, __lo);
  4878.   return __joined;
  4879. }
  4880.  
  4881. /// Builds a 256-bit vector of [4 x double] by joining two 128-bit vectors
  4882. ///    of [2 x double]. Equivalent to _mm256_set_m128d with the two arguments
  4883. ///    given in the opposite (low-then-high) order.
  4884. ///
  4885. /// \headerfile <x86intrin.h>
  4886. ///
  4887. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4888. ///
  4889. /// \param __lo
  4890. ///    A 128-bit vector of [2 x double] placed in the lower 128 bits of the
  4891. ///    result.
  4892. /// \param __hi
  4893. ///    A 128-bit vector of [2 x double] placed in the upper 128 bits of the
  4894. ///    result.
  4895. /// \returns A 256-bit vector of [4 x double] holding the joined halves.
  4896. static __inline __m256d __DEFAULT_FN_ATTRS
  4897. _mm256_setr_m128d (__m128d __lo, __m128d __hi)
  4898. {
  4899.   /* Delegate with the arguments swapped back into high-then-low order. */
  4900.   __m256d __joined = _mm256_set_m128d(__hi, __lo);
  4901.   return __joined;
  4902. }
  4903.  
  4904. /// Builds a 256-bit integer vector by joining two 128-bit integer vectors.
  4905. ///    Equivalent to _mm256_set_m128i with the two arguments given in the
  4906. ///    opposite (low-then-high) order.
  4907. ///
  4908. /// \headerfile <x86intrin.h>
  4909. ///
  4910. /// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
  4911. ///
  4912. /// \param __lo
  4913. ///    A 128-bit integer vector placed in the lower 128 bits of the result.
  4914. /// \param __hi
  4915. ///    A 128-bit integer vector placed in the upper 128 bits of the result.
  4916. /// \returns A 256-bit integer vector holding the joined halves.
  4917. static __inline __m256i __DEFAULT_FN_ATTRS
  4918. _mm256_setr_m128i (__m128i __lo, __m128i __hi)
  4919. {
  4920.   /* Delegate with the arguments swapped back into high-then-low order. */
  4921.   __m256i __joined = _mm256_set_m128i(__hi, __lo);
  4922.   return __joined;
  4923. }
  4924.  
  4925. /* SIMD load ops (unaligned) */
  4926. /// Reads two 128-bit vectors of [4 x float] from memory locations that need
  4927. ///    not be aligned and joins them into one 256-bit vector of [8 x float].
  4928. ///
  4929. /// \headerfile <x86intrin.h>
  4930. ///
  4931. /// This intrinsic corresponds to load instructions followed by the
  4932. ///   <c> VINSERTF128 </c> instruction.
  4933. ///
  4934. /// \param __addr_hi
  4935. ///    Points to 4 consecutive single-precision floating-point values that
  4936. ///    become bits[255:128] of the result. No particular alignment is
  4937. ///    required.
  4938. /// \param __addr_lo
  4939. ///    Points to 4 consecutive single-precision floating-point values that
  4940. ///    become bits[127:0] of the result. No particular alignment is
  4941. ///    required.
  4942. /// \returns A 256-bit vector of [8 x float] holding the two loaded halves,
  4943. ///    high half from \a __addr_hi and low half from \a __addr_lo.
  4944. static __inline __m256 __DEFAULT_FN_ATTRS
  4945. _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
  4946. {
  4947.   /* Load each half separately, then concatenate them as high:low. */
  4948.   __m128 __upper = _mm_loadu_ps(__addr_hi);
  4949.   __m128 __lower = _mm_loadu_ps(__addr_lo);
  4950.   return _mm256_set_m128(__upper, __lower);
  4951. }
  4952.  
  4953. /// Reads two 128-bit vectors of [2 x double] from memory locations that need
  4954. ///    not be aligned and joins them into one 256-bit vector of [4 x double].
  4955. ///
  4956. /// \headerfile <x86intrin.h>
  4957. ///
  4958. /// This intrinsic corresponds to load instructions followed by the
  4959. ///   <c> VINSERTF128 </c> instruction.
  4960. ///
  4961. /// \param __addr_hi
  4962. ///    Points to two consecutive double-precision floating-point values that
  4963. ///    become bits[255:128] of the result. No particular alignment is
  4964. ///    required.
  4965. /// \param __addr_lo
  4966. ///    Points to two consecutive double-precision floating-point values that
  4967. ///    become bits[127:0] of the result. No particular alignment is
  4968. ///    required.
  4969. /// \returns A 256-bit vector of [4 x double] holding the two loaded halves,
  4970. ///    high half from \a __addr_hi and low half from \a __addr_lo.
  4971. static __inline __m256d __DEFAULT_FN_ATTRS
  4972. _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
  4973. {
  4974.   /* Load each half separately, then concatenate them as high:low. */
  4975.   __m128d __upper = _mm_loadu_pd(__addr_hi);
  4976.   __m128d __lower = _mm_loadu_pd(__addr_lo);
  4977.   return _mm256_set_m128d(__upper, __lower);
  4978. }
  4979.  
  4980. /// Reads two 128-bit integer vectors from memory locations that need not be
  4981. ///    aligned and joins them into one 256-bit integer vector.
  4982. ///
  4983. /// \headerfile <x86intrin.h>
  4984. ///
  4985. /// This intrinsic corresponds to load instructions followed by the
  4986. ///   <c> VINSERTF128 </c> instruction.
  4987. ///
  4988. /// \param __addr_hi
  4989. ///    Points to a 128-bit integer vector that becomes bits[255:128] of the
  4990. ///    result. No particular alignment is required.
  4991. /// \param __addr_lo
  4992. ///    Points to a 128-bit integer vector that becomes bits[127:0] of the
  4993. ///    result. No particular alignment is required.
  4994. /// \returns A 256-bit integer vector holding the two loaded halves.
  4995. static __inline __m256i __DEFAULT_FN_ATTRS
  4996. _mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
  4997. {
  4998.   /* Load each half separately, then concatenate them as high:low. */
  4999.   __m128i __upper = _mm_loadu_si128(__addr_hi);
  5000.   __m128i __lower = _mm_loadu_si128(__addr_lo);
  5001.   return _mm256_set_m128i(__upper, __lower);
  5002. }
  5003.  
  5004. /* SIMD store ops (unaligned) */
  5005. /// Writes the lower and upper 128-bit halves of a 256-bit vector of
  5006. ///    [8 x float] to two separate memory locations that need not be aligned.
  5007. ///
  5008. /// \headerfile <x86intrin.h>
  5009. ///
  5010. /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
  5011. ///   store instructions.
  5012. ///
  5013. /// \param __addr_hi
  5014. ///    A pointer to a 128-bit memory location that receives bits[255:128] of
  5015. ///    \a __a. The address of this memory location does not have to be
  5016. ///    aligned.
  5017. /// \param __addr_lo
  5018. ///    A pointer to a 128-bit memory location that receives bits[127:0] of
  5019. ///    \a __a. The address of this memory location does not have to be
  5020. ///    aligned.
  5021. /// \param __a
  5022. ///    The 256-bit vector of [8 x float] to be split and stored.
  5023. static __inline void __DEFAULT_FN_ATTRS
  5024. _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
  5025. {
  5026.   /* Split the vector into its two 128-bit halves. */
  5027.   __m128 __lower = _mm256_castps256_ps128(__a);
  5028.   __m128 __upper = _mm256_extractf128_ps(__a, 1);
  5029.   /* Write the low half first, then the high half. */
  5030.   _mm_storeu_ps(__addr_lo, __lower);
  5031.   _mm_storeu_ps(__addr_hi, __upper);
  5032. }
  5033.  
  5034. /// Writes the lower and upper 128-bit halves of a 256-bit vector of
  5035. ///    [4 x double] to two separate memory locations that need not be aligned.
  5036. ///
  5037. /// \headerfile <x86intrin.h>
  5038. ///
  5039. /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
  5040. ///   store instructions.
  5041. ///
  5042. /// \param __addr_hi
  5043. ///    A pointer to a 128-bit memory location that receives bits[255:128] of
  5044. ///    \a __a. The address of this memory location does not have to be
  5045. ///    aligned.
  5046. /// \param __addr_lo
  5047. ///    A pointer to a 128-bit memory location that receives bits[127:0] of
  5048. ///    \a __a. The address of this memory location does not have to be
  5049. ///    aligned.
  5050. /// \param __a
  5051. ///    The 256-bit vector of [4 x double] to be split and stored.
  5052. static __inline void __DEFAULT_FN_ATTRS
  5053. _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
  5054. {
  5055.   /* Split the vector into its two 128-bit halves. */
  5056.   __m128d __lower = _mm256_castpd256_pd128(__a);
  5057.   __m128d __upper = _mm256_extractf128_pd(__a, 1);
  5058.   /* Write the low half first, then the high half. */
  5059.   _mm_storeu_pd(__addr_lo, __lower);
  5060.   _mm_storeu_pd(__addr_hi, __upper);
  5061. }
  5062.  
  5063. /// Writes the lower and upper 128-bit halves of a 256-bit integer vector
  5064. ///    to two separate memory locations that need not be aligned.
  5065. ///
  5066. /// \headerfile <x86intrin.h>
  5067. ///
  5068. /// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
  5069. ///   store instructions.
  5070. ///
  5071. /// \param __addr_hi
  5072. ///    A pointer to a 128-bit memory location that receives bits[255:128] of
  5073. ///    \a __a. The address of this memory location does not have to be
  5074. ///    aligned.
  5075. /// \param __addr_lo
  5076. ///    A pointer to a 128-bit memory location that receives bits[127:0] of
  5077. ///    \a __a. The address of this memory location does not have to be
  5078. ///    aligned.
  5079. /// \param __a
  5080. ///    The 256-bit integer vector to be split and stored.
  5081. static __inline void __DEFAULT_FN_ATTRS
  5082. _mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
  5083. {
  5084.   /* Split the vector into its two 128-bit halves. */
  5085.   __m128i __lower = _mm256_castsi256_si128(__a);
  5086.   __m128i __upper = _mm256_extractf128_si256(__a, 1);
  5087.   /* Write the low half first, then the high half. */
  5088.   _mm_storeu_si128(__addr_lo, __lower);
  5089.   _mm_storeu_si128(__addr_hi, __upper);
  5090. }
  5091.  
  5092. #undef __DEFAULT_FN_ATTRS /* attribute helper used by the inline functions above */
  5093. #undef __DEFAULT_FN_ATTRS128 /* presumably defined earlier in this header - confirm */
  5094.  
  5095. #endif /* __AVXINTRIN_H */
  5096.