  1. /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. /* Implemented from the specification included in the Intel C++ Compiler
  11.    User Guide and Reference, version 9.0.  */
  12.  
  13. #ifndef NO_WARN_X86_INTRINSICS
  14. /* This header file is intended to help port code that uses Intel
  15.    intrinsics from x86_64 to powerpc64/powerpc64le.
  16.  
  17.    Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
  18.    types, the PowerPC VMX/VSX ISA is a good match for vector float SIMD
  19.    operations. However, scalar float operations in vector (XMM) registers
  20.    require the POWER8 VSX ISA (2.07) level. There are differences in data
  21.    format and placement of float scalars in the vector register, which
  22.    require extra steps to match SSE2 scalar float semantics on POWER.
  23.  
  24.    It should be noted that X86_64's MXCSR and PowerISA's FPSCR/VSCR
  25.    registers differ significantly. It is recommended to use the portable
  26.    <fenv.h> interface instead of accessing the MXCSR directly.
  27.  
  28.    Most SSE2 scalar float intrinsic operations can be performed more
  29.    efficiently as C language float scalar operations or optimized to
  30.    use vector SIMD operations. We recommend this for new applications.
  31. */
  32. #error                                                                         \
  33.     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
  34. #endif
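/* For illustration: code ported with the help of this header is typically
   built for a POWER8 or newer target with the warning above acknowledged,
   for example:
       cc -O2 -mcpu=power8 -DNO_WARN_X86_INTRINSICS ported_code.c
   The driver name, optimization level and file name here are only
   placeholders; the header itself only requires -DNO_WARN_X86_INTRINSICS.  */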
  35.  
  36. #ifndef EMMINTRIN_H_
  37. #define EMMINTRIN_H_
  38.  
  39. #if defined(__powerpc64__) &&                                                  \
  40.     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
  41.  
  42. #include <altivec.h>
  43.  
  44. /* We need definitions from the SSE header files.  */
  45. #include <xmmintrin.h>
  46.  
  47. /* SSE2 */
  48. typedef __vector double __v2df;
  49. typedef __vector long long __v2di;
  50. typedef __vector unsigned long long __v2du;
  51. typedef __vector int __v4si;
  52. typedef __vector unsigned int __v4su;
  53. typedef __vector short __v8hi;
  54. typedef __vector unsigned short __v8hu;
  55. typedef __vector signed char __v16qi;
  56. typedef __vector unsigned char __v16qu;
  57.  
  58. /* The Intel API is flexible enough that we must allow aliasing with other
  59.    vector types, and their scalar components.  */
  60. typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
  61. typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
  62.  
  63. /* Unaligned versions of the same types.  */
  64. typedef long long __m128i_u
  65.     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  66. typedef double __m128d_u
  67.     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  68.  
  69. /* Define a two-value permute mask.  */
  70. #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
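/* For illustration: _MM_SHUFFLE2 packs two selector bits, so
   _MM_SHUFFLE2(0, 0) == 0, _MM_SHUFFLE2(1, 0) == 2 and
   _MM_SHUFFLE2(1, 1) == 3.  See _mm_shuffle_pd below for how the two
   bits select elements.  */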
  71.  
  72. /* Create a vector with element 0 as F and the rest zero.  */
  73. extern __inline __m128d
  74.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  75.     _mm_set_sd(double __F) {
  76.   return __extension__(__m128d){__F, 0.0};
  77. }
  78.  
  79. /* Create a vector with both elements equal to F.  */
  80. extern __inline __m128d
  81.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  82.     _mm_set1_pd(double __F) {
  83.   return __extension__(__m128d){__F, __F};
  84. }
  85.  
  86. extern __inline __m128d
  87.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  88.     _mm_set_pd1(double __F) {
  89.   return _mm_set1_pd(__F);
  90. }
  91.  
  92. /* Create a vector with the lower value X and upper value W.  */
  93. extern __inline __m128d
  94.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  95.     _mm_set_pd(double __W, double __X) {
  96.   return __extension__(__m128d){__X, __W};
  97. }
  98.  
  99. /* Create a vector with the lower value W and upper value X.  */
  100. extern __inline __m128d
  101.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  102.     _mm_setr_pd(double __W, double __X) {
  103.   return __extension__(__m128d){__W, __X};
  104. }
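/* For illustration of the element ordering of the two constructors above:
   _mm_set_pd (1.0, 2.0) yields {2.0, 1.0} (element [0] == 2.0), while
   _mm_setr_pd (1.0, 2.0) yields {1.0, 2.0} (element [0] == 1.0).  */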
  105.  
  106. /* Create an undefined vector.  */
  107. extern __inline __m128d
  108.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  109.     _mm_undefined_pd(void) {
  110.   __m128d __Y = __Y;
  111.   return __Y;
  112. }
  113.  
  114. /* Create a vector of zeros.  */
  115. extern __inline __m128d
  116.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  117.     _mm_setzero_pd(void) {
  118.   return (__m128d)vec_splats(0);
  119. }
  120.  
  121. /* Sets the low DPFP value of A from the low value of B.  */
  122. extern __inline __m128d
  123.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  124.     _mm_move_sd(__m128d __A, __m128d __B) {
  125.   __v2df __result = (__v2df)__A;
  126.   __result[0] = ((__v2df)__B)[0];
  127.   return (__m128d)__result;
  128. }
  129.  
  130. /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
  131. extern __inline __m128d
  132.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  133.     _mm_load_pd(double const *__P) {
  134.   return ((__m128d)vec_ld(0, (__v16qu *)__P));
  135. }
  136.  
  137. /* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
  138. extern __inline __m128d
  139.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  140.     _mm_loadu_pd(double const *__P) {
  141.   return (vec_vsx_ld(0, __P));
  142. }
  143.  
  144. /* Create a vector with both elements equal to *P.  */
  145. extern __inline __m128d
  146.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  147.     _mm_load1_pd(double const *__P) {
  148.   return (vec_splats(*__P));
  149. }
  150.  
  151. /* Create a vector with element 0 as *P and the rest zero.  */
  152. extern __inline __m128d
  153.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  154.     _mm_load_sd(double const *__P) {
  155.   return _mm_set_sd(*__P);
  156. }
  157.  
  158. extern __inline __m128d
  159.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  160.     _mm_load_pd1(double const *__P) {
  161.   return _mm_load1_pd(__P);
  162. }
  163.  
  164. /* Load two DPFP values in reverse order.  The address must be aligned.  */
  165. extern __inline __m128d
  166.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  167.     _mm_loadr_pd(double const *__P) {
  168.   __v2df __tmp = _mm_load_pd(__P);
  169.   return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
  170. }
  171.  
  172. /* Store two DPFP values.  The address must be 16-byte aligned.  */
  173. extern __inline void
  174.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  175.     _mm_store_pd(double *__P, __m128d __A) {
  176.   vec_st((__v16qu)__A, 0, (__v16qu *)__P);
  177. }
  178.  
  179. /* Store two DPFP values.  The address need not be 16-byte aligned.  */
  180. extern __inline void
  181.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  182.     _mm_storeu_pd(double *__P, __m128d __A) {
  183.   *(__m128d_u *)__P = __A;
  184. }
  185.  
  186. /* Stores the lower DPFP value.  */
  187. extern __inline void
  188.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  189.     _mm_store_sd(double *__P, __m128d __A) {
  190.   *__P = ((__v2df)__A)[0];
  191. }
  192.  
  193. extern __inline double
  194.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  195.     _mm_cvtsd_f64(__m128d __A) {
  196.   return ((__v2df)__A)[0];
  197. }
  198.  
  199. extern __inline void
  200.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  201.     _mm_storel_pd(double *__P, __m128d __A) {
  202.   _mm_store_sd(__P, __A);
  203. }
  204.  
  205. /* Stores the upper DPFP value.  */
  206. extern __inline void
  207.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  208.     _mm_storeh_pd(double *__P, __m128d __A) {
  209.   *__P = ((__v2df)__A)[1];
  210. }
  211. /* Store the lower DPFP value across two words.
  212.    The address must be 16-byte aligned.  */
  213. extern __inline void
  214.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  215.     _mm_store1_pd(double *__P, __m128d __A) {
  216.   _mm_store_pd(__P, vec_splat(__A, 0));
  217. }
  218.  
  219. extern __inline void
  220.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  221.     _mm_store_pd1(double *__P, __m128d __A) {
  222.   _mm_store1_pd(__P, __A);
  223. }
  224.  
  225. /* Store two DPFP values in reverse order.  The address must be aligned.  */
  226. extern __inline void
  227.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  228.     _mm_storer_pd(double *__P, __m128d __A) {
  229.   _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
  230. }
  231.  
  232. /* Intel intrinsic.  */
  233. extern __inline long long
  234.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  235.     _mm_cvtsi128_si64(__m128i __A) {
  236.   return ((__v2di)__A)[0];
  237. }
  238.  
  239. /* Microsoft intrinsic.  */
  240. extern __inline long long
  241.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  242.     _mm_cvtsi128_si64x(__m128i __A) {
  243.   return ((__v2di)__A)[0];
  244. }
  245.  
  246. extern __inline __m128d
  247.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  248.     _mm_add_pd(__m128d __A, __m128d __B) {
  249.   return (__m128d)((__v2df)__A + (__v2df)__B);
  250. }
  251.  
  252. /* Add the lower double-precision (64-bit) floating-point element in
  253.    a and b, store the result in the lower element of dst, and copy
  254.    the upper element from a to the upper element of dst. */
  255. extern __inline __m128d
  256.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  257.     _mm_add_sd(__m128d __A, __m128d __B) {
  258.   __A[0] = __A[0] + __B[0];
  259.   return (__A);
  260. }
  261.  
  262. extern __inline __m128d
  263.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  264.     _mm_sub_pd(__m128d __A, __m128d __B) {
  265.   return (__m128d)((__v2df)__A - (__v2df)__B);
  266. }
  267.  
  268. extern __inline __m128d
  269.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  270.     _mm_sub_sd(__m128d __A, __m128d __B) {
  271.   __A[0] = __A[0] - __B[0];
  272.   return (__A);
  273. }
  274.  
  275. extern __inline __m128d
  276.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  277.     _mm_mul_pd(__m128d __A, __m128d __B) {
  278.   return (__m128d)((__v2df)__A * (__v2df)__B);
  279. }
  280.  
  281. extern __inline __m128d
  282.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  283.     _mm_mul_sd(__m128d __A, __m128d __B) {
  284.   __A[0] = __A[0] * __B[0];
  285.   return (__A);
  286. }
  287.  
  288. extern __inline __m128d
  289.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  290.     _mm_div_pd(__m128d __A, __m128d __B) {
  291.   return (__m128d)((__v2df)__A / (__v2df)__B);
  292. }
  293.  
  294. extern __inline __m128d
  295.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  296.     _mm_div_sd(__m128d __A, __m128d __B) {
  297.   __A[0] = __A[0] / __B[0];
  298.   return (__A);
  299. }
  300.  
  301. extern __inline __m128d
  302.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  303.     _mm_sqrt_pd(__m128d __A) {
  304.   return (vec_sqrt(__A));
  305. }
  306.  
  307. /* Return pair {sqrt (B[0]), A[1]}.  */
  308. extern __inline __m128d
  309.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  310.     _mm_sqrt_sd(__m128d __A, __m128d __B) {
  311.   __v2df __c;
  312.   __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
  313.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  314. }
  315.  
  316. extern __inline __m128d
  317.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  318.     _mm_min_pd(__m128d __A, __m128d __B) {
  319.   return (vec_min(__A, __B));
  320. }
  321.  
  322. extern __inline __m128d
  323.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  324.     _mm_min_sd(__m128d __A, __m128d __B) {
  325.   __v2df __a, __b, __c;
  326.   __a = vec_splats(__A[0]);
  327.   __b = vec_splats(__B[0]);
  328.   __c = vec_min(__a, __b);
  329.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  330. }
  331.  
  332. extern __inline __m128d
  333.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  334.     _mm_max_pd(__m128d __A, __m128d __B) {
  335.   return (vec_max(__A, __B));
  336. }
  337.  
  338. extern __inline __m128d
  339.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  340.     _mm_max_sd(__m128d __A, __m128d __B) {
  341.   __v2df __a, __b, __c;
  342.   __a = vec_splats(__A[0]);
  343.   __b = vec_splats(__B[0]);
  344.   __c = vec_max(__a, __b);
  345.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  346. }
  347.  
  348. extern __inline __m128d
  349.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  350.     _mm_cmpeq_pd(__m128d __A, __m128d __B) {
  351.   return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
  352. }
  353.  
  354. extern __inline __m128d
  355.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  356.     _mm_cmplt_pd(__m128d __A, __m128d __B) {
  357.   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
  358. }
  359.  
  360. extern __inline __m128d
  361.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  362.     _mm_cmple_pd(__m128d __A, __m128d __B) {
  363.   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
  364. }
  365.  
  366. extern __inline __m128d
  367.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  368.     _mm_cmpgt_pd(__m128d __A, __m128d __B) {
  369.   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
  370. }
  371.  
  372. extern __inline __m128d
  373.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  374.     _mm_cmpge_pd(__m128d __A, __m128d __B) {
  375.   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
  376. }
  377.  
  378. extern __inline __m128d
  379.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  380.     _mm_cmpneq_pd(__m128d __A, __m128d __B) {
  381.   __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
  382.   return ((__m128d)vec_nor(__temp, __temp));
  383. }
  384.  
  385. extern __inline __m128d
  386.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  387.     _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
  388.   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
  389. }
  390.  
  391. extern __inline __m128d
  392.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  393.     _mm_cmpnle_pd(__m128d __A, __m128d __B) {
  394.   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
  395. }
  396.  
  397. extern __inline __m128d
  398.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  399.     _mm_cmpngt_pd(__m128d __A, __m128d __B) {
  400.   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
  401. }
  402.  
  403. extern __inline __m128d
  404.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  405.     _mm_cmpnge_pd(__m128d __A, __m128d __B) {
  406.   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
  407. }
  408.  
  409. extern __inline __m128d
  410.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  411.     _mm_cmpord_pd(__m128d __A, __m128d __B) {
  412.   __v2du __c, __d;
  413. /* Comparing a value against itself returns false (0's) if it is NaN.  */
  414.   __c = (__v2du)vec_cmpeq(__A, __A);
  415.   __d = (__v2du)vec_cmpeq(__B, __B);
  416.   /* A != NAN and B != NAN.  */
  417.   return ((__m128d)vec_and(__c, __d));
  418. }
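/* For illustration: with __A = {NAN, 1.0} and __B = {2.0, 3.0},
   _mm_cmpord_pd returns all-zeros in element [0] and all-ones in
   element [1], since only element [1] compares two ordered (non-NaN)
   values.  */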
  419.  
  420. extern __inline __m128d
  421.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  422.     _mm_cmpunord_pd(__m128d __A, __m128d __B) {
  423. #if _ARCH_PWR8
  424.   __v2du __c, __d;
  425. /* Comparing a value against itself returns false (0's) if it is NaN.  */
  426.   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  427.   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  428. /* A == NaN OR B == NaN converts to:
  429.    NOT(A != NaN) OR NOT(B != NaN).  */
  430.   __c = vec_nor(__c, __c);
  431.   return ((__m128d)vec_orc(__c, __d));
  432. #else
  433.   __v2du __c, __d;
  434. /* Comparing a value against itself returns false (0's) if it is NaN.  */
  435.   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
  436.   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  437. /* Invert so that true ('1's) means NaN.  */
  438.   __c = vec_nor(__c, __c);
  439.   __d = vec_nor(__d, __d);
  440.   return ((__m128d)vec_or(__c, __d));
  441. #endif
  442. }
  443.  
  444. extern __inline __m128d
  445.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  446.     _mm_cmpeq_sd(__m128d __A, __m128d __B) {
  447.   __v2df __a, __b, __c;
  448.   /* PowerISA VSX does not allow partial (for just lower double)
  449.      results. So to ensure we don't generate spurious exceptions
  450.      (from the upper double values) we splat the lower double
  451.      before we do the operation. */
  452.   __a = vec_splats(__A[0]);
  453.   __b = vec_splats(__B[0]);
  454.   __c = (__v2df)vec_cmpeq(__a, __b);
  455.   /* Then we merge the lower double result with the original upper
  456.      double from __A.  */
  457.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  458. }
  459.  
  460. extern __inline __m128d
  461.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  462.     _mm_cmplt_sd(__m128d __A, __m128d __B) {
  463.   __v2df __a, __b, __c;
  464.   __a = vec_splats(__A[0]);
  465.   __b = vec_splats(__B[0]);
  466.   __c = (__v2df)vec_cmplt(__a, __b);
  467.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  468. }
  469.  
  470. extern __inline __m128d
  471.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  472.     _mm_cmple_sd(__m128d __A, __m128d __B) {
  473.   __v2df __a, __b, __c;
  474.   __a = vec_splats(__A[0]);
  475.   __b = vec_splats(__B[0]);
  476.   __c = (__v2df)vec_cmple(__a, __b);
  477.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  478. }
  479.  
  480. extern __inline __m128d
  481.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  482.     _mm_cmpgt_sd(__m128d __A, __m128d __B) {
  483.   __v2df __a, __b, __c;
  484.   __a = vec_splats(__A[0]);
  485.   __b = vec_splats(__B[0]);
  486.   __c = (__v2df)vec_cmpgt(__a, __b);
  487.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  488. }
  489.  
  490. extern __inline __m128d
  491.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  492.     _mm_cmpge_sd(__m128d __A, __m128d __B) {
  493.   __v2df __a, __b, __c;
  494.   __a = vec_splats(__A[0]);
  495.   __b = vec_splats(__B[0]);
  496.   __c = (__v2df)vec_cmpge(__a, __b);
  497.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  498. }
  499.  
  500. extern __inline __m128d
  501.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  502.     _mm_cmpneq_sd(__m128d __A, __m128d __B) {
  503.   __v2df __a, __b, __c;
  504.   __a = vec_splats(__A[0]);
  505.   __b = vec_splats(__B[0]);
  506.   __c = (__v2df)vec_cmpeq(__a, __b);
  507.   __c = vec_nor(__c, __c);
  508.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  509. }
  510.  
  511. extern __inline __m128d
  512.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  513.     _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
  514.   __v2df __a, __b, __c;
  515.   __a = vec_splats(__A[0]);
  516.   __b = vec_splats(__B[0]);
  517.   /* Not less than is just greater than or equal.  */
  518.   __c = (__v2df)vec_cmpge(__a, __b);
  519.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  520. }
  521.  
  522. extern __inline __m128d
  523.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  524.     _mm_cmpnle_sd(__m128d __A, __m128d __B) {
  525.   __v2df __a, __b, __c;
  526.   __a = vec_splats(__A[0]);
  527.   __b = vec_splats(__B[0]);
  528.   /* Not less than or equal is just greater than.  */
  529.   __c = (__v2df)vec_cmpgt(__a, __b);
  530.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  531. }
  532.  
  533. extern __inline __m128d
  534.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  535.     _mm_cmpngt_sd(__m128d __A, __m128d __B) {
  536.   __v2df __a, __b, __c;
  537.   __a = vec_splats(__A[0]);
  538.   __b = vec_splats(__B[0]);
  539.   /* Not greater than is just less than or equal.  */
  540.   __c = (__v2df)vec_cmple(__a, __b);
  541.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  542. }
  543.  
  544. extern __inline __m128d
  545.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  546.     _mm_cmpnge_sd(__m128d __A, __m128d __B) {
  547.   __v2df __a, __b, __c;
  548.   __a = vec_splats(__A[0]);
  549.   __b = vec_splats(__B[0]);
  550.   /* Not greater than or equal is just less than.  */
  551.   __c = (__v2df)vec_cmplt(__a, __b);
  552.   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
  553. }
  554.  
  555. extern __inline __m128d
  556.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  557.     _mm_cmpord_sd(__m128d __A, __m128d __B) {
  558.   __v2df __r;
  559.   __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  560.   return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
  561. }
  562.  
  563. extern __inline __m128d
  564.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  565.     _mm_cmpunord_sd(__m128d __A, __m128d __B) {
  566.   __v2df __r;
  567.   __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
  568.   return (__m128d)_mm_setr_pd(__r[0], __A[1]);
  569. }
  570.  
  571. /* FIXME
  572.    The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
  573.    exactly the same because GCC for PowerPC only generates unordered
  574.    compares (scalar and vector).
  575.    Technically _mm_comieq_sd et al. should use the ordered compare
  576.    and signal on QNaNs.  The _mm_ucomieq_sd et al. should
  577.    be OK.  */
  578. extern __inline int
  579.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  580.     _mm_comieq_sd(__m128d __A, __m128d __B) {
  581.   return (__A[0] == __B[0]);
  582. }
  583.  
  584. extern __inline int
  585.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  586.     _mm_comilt_sd(__m128d __A, __m128d __B) {
  587.   return (__A[0] < __B[0]);
  588. }
  589.  
  590. extern __inline int
  591.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  592.     _mm_comile_sd(__m128d __A, __m128d __B) {
  593.   return (__A[0] <= __B[0]);
  594. }
  595.  
  596. extern __inline int
  597.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  598.     _mm_comigt_sd(__m128d __A, __m128d __B) {
  599.   return (__A[0] > __B[0]);
  600. }
  601.  
  602. extern __inline int
  603.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  604.     _mm_comige_sd(__m128d __A, __m128d __B) {
  605.   return (__A[0] >= __B[0]);
  606. }
  607.  
  608. extern __inline int
  609.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  610.     _mm_comineq_sd(__m128d __A, __m128d __B) {
  611.   return (__A[0] != __B[0]);
  612. }
  613.  
  614. extern __inline int
  615.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  616.     _mm_ucomieq_sd(__m128d __A, __m128d __B) {
  617.   return (__A[0] == __B[0]);
  618. }
  619.  
  620. extern __inline int
  621.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  622.     _mm_ucomilt_sd(__m128d __A, __m128d __B) {
  623.   return (__A[0] < __B[0]);
  624. }
  625.  
  626. extern __inline int
  627.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  628.     _mm_ucomile_sd(__m128d __A, __m128d __B) {
  629.   return (__A[0] <= __B[0]);
  630. }
  631.  
  632. extern __inline int
  633.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  634.     _mm_ucomigt_sd(__m128d __A, __m128d __B) {
  635.   return (__A[0] > __B[0]);
  636. }
  637.  
  638. extern __inline int
  639.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  640.     _mm_ucomige_sd(__m128d __A, __m128d __B) {
  641.   return (__A[0] >= __B[0]);
  642. }
  643.  
  644. extern __inline int
  645.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  646.     _mm_ucomineq_sd(__m128d __A, __m128d __B) {
  647.   return (__A[0] != __B[0]);
  648. }
  649.  
  650. /* Create a vector of Qi, where i is the element number.  */
  651. extern __inline __m128i
  652.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  653.     _mm_set_epi64x(long long __q1, long long __q0) {
  654.   return __extension__(__m128i)(__v2di){__q0, __q1};
  655. }
  656.  
  657. extern __inline __m128i
  658.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  659.     _mm_set_epi64(__m64 __q1, __m64 __q0) {
  660.   return _mm_set_epi64x((long long)__q1, (long long)__q0);
  661. }
  662.  
  663. extern __inline __m128i
  664.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  665.     _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
  666.   return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
  667. }
  668.  
  669. extern __inline __m128i
  670.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  671.     _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
  672.                   short __q2, short __q1, short __q0) {
  673.   return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
  674.                                         __q4, __q5, __q6, __q7};
  675. }
  676.  
  677. extern __inline __m128i
  678.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  679.     _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
  680.                  char __q10, char __q09, char __q08, char __q07, char __q06,
  681.                  char __q05, char __q04, char __q03, char __q02, char __q01,
  682.                  char __q00) {
  683.   return __extension__(__m128i)(__v16qi){
  684.       __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
  685.       __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
  686. }
  687.  
  688. /* Set all of the elements of the vector to A.  */
  689. extern __inline __m128i
  690.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  691.     _mm_set1_epi64x(long long __A) {
  692.   return _mm_set_epi64x(__A, __A);
  693. }
  694.  
  695. extern __inline __m128i
  696.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  697.     _mm_set1_epi64(__m64 __A) {
  698.   return _mm_set_epi64(__A, __A);
  699. }
  700.  
  701. extern __inline __m128i
  702.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  703.     _mm_set1_epi32(int __A) {
  704.   return _mm_set_epi32(__A, __A, __A, __A);
  705. }
  706.  
  707. extern __inline __m128i
  708.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  709.     _mm_set1_epi16(short __A) {
  710.   return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
  711. }
  712.  
  713. extern __inline __m128i
  714.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  715.     _mm_set1_epi8(char __A) {
  716.   return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
  717.                       __A, __A, __A, __A, __A);
  718. }
  719.  
  720. /* Create a vector of Qi, where i is the element number.
  721.    The parameter order is reversed from the _mm_set_epi* functions.  */
  722. extern __inline __m128i
  723.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  724.     _mm_setr_epi64(__m64 __q0, __m64 __q1) {
  725.   return _mm_set_epi64(__q1, __q0);
  726. }
  727.  
  728. extern __inline __m128i
  729.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  730.     _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
  731.   return _mm_set_epi32(__q3, __q2, __q1, __q0);
  732. }
  733.  
  734. extern __inline __m128i
  735.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  736.     _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
  737.                    short __q5, short __q6, short __q7) {
  738.   return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
  739. }
  740.  
  741. extern __inline __m128i
  742.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  743.     _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
  744.                   char __q05, char __q06, char __q07, char __q08, char __q09,
  745.                   char __q10, char __q11, char __q12, char __q13, char __q14,
  746.                   char __q15) {
  747.   return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
  748.                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
  749. }
  750.  
  751. /* Load a 128-bit integer vector from P.  The address must be 16-byte aligned.  */
  752. extern __inline __m128i
  753.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  754.     _mm_load_si128(__m128i const *__P) {
  755.   return *__P;
  756. }
  757.  
  758. extern __inline __m128i
  759.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  760.     _mm_loadu_si128(__m128i_u const *__P) {
  761.   return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
  762. }
  763.  
  764. extern __inline __m128i
  765.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  766.     _mm_loadl_epi64(__m128i_u const *__P) {
  767.   return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
  768. }
  769.  
  770. extern __inline void
  771.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  772.     _mm_store_si128(__m128i *__P, __m128i __B) {
  773.   vec_st((__v16qu)__B, 0, (__v16qu *)__P);
  774. }
  775.  
  776. extern __inline void
  777.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  778.     _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
  779.   *__P = __B;
  780. }
  781.  
  782. extern __inline void
  783.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  784.     _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
  785.   *(long long *)__P = ((__v2di)__B)[0];
  786. }
  787.  
  788. extern __inline __m64
  789.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  790.     _mm_movepi64_pi64(__m128i_u __B) {
  791.   return (__m64)((__v2di)__B)[0];
  792. }
  793.  
  794. extern __inline __m128i
  795.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  796.     _mm_movpi64_epi64(__m64 __A) {
  797.   return _mm_set_epi64((__m64)0LL, __A);
  798. }
  799.  
  800. extern __inline __m128i
  801.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  802.     _mm_move_epi64(__m128i __A) {
  803.   return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
  804. }
  805.  
  806. /* Create an undefined vector.  */
  807. extern __inline __m128i
  808.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  809.     _mm_undefined_si128(void) {
  810.   __m128i __Y = __Y;
  811.   return __Y;
  812. }
  813.  
  814. /* Create a vector of zeros.  */
  815. extern __inline __m128i
  816.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  817.     _mm_setzero_si128(void) {
  818.   return __extension__(__m128i)(__v4si){0, 0, 0, 0};
  819. }
  820.  
  821. #ifdef _ARCH_PWR8
  822. extern __inline __m128d
  823.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  824.     _mm_cvtepi32_pd(__m128i __A) {
  825.   __v2di __val;
  826.   /* For LE we need to generate Vector Unpack Low Signed Word,
  827.      which is what vec_unpackh produces here.  */
  828.   __val = (__v2di)vec_unpackh((__v4si)__A);
  829.  
  830.   return (__m128d)vec_ctf(__val, 0);
  831. }
  832. #endif
  833.  
  834. extern __inline __m128
  835.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  836.     _mm_cvtepi32_ps(__m128i __A) {
  837.   return ((__m128)vec_ctf((__v4si)__A, 0));
  838. }
  839.  
  840. extern __inline __m128i
  841.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  842.     _mm_cvtpd_epi32(__m128d __A) {
  843.   __v2df __rounded = vec_rint(__A);
  844.   __v4si __result, __temp;
  845.   const __v4si __vzero = {0, 0, 0, 0};
  846.  
  847.   /* VSX Vector truncate Double-Precision to integer and Convert to
  848.    Signed Integer Word format with Saturate.  */
  849.   __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
  850.  
  851. #ifdef _ARCH_PWR8
  852. #ifdef __LITTLE_ENDIAN__
  853.   __temp = vec_mergeo(__temp, __temp);
  854. #else
  855.   __temp = vec_mergee(__temp, __temp);
  856. #endif
  857.   __result = (__v4si)vec_vpkudum((__vector long long)__temp,
  858.                                  (__vector long long)__vzero);
  859. #else
  860.   {
  861.     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
  862.                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
  863.     __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  864.   }
  865. #endif
  866.   return (__m128i)__result;
  867. }
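/* For illustration (assuming the default round-to-nearest-even mode):
   _mm_cvtpd_epi32(_mm_set_pd(2.5, 1.5)) yields {2, 2, 0, 0} as __v4si;
   both 1.5 and 2.5 round to 2 and the upper two lanes are zeroed.  */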
  868.  
  869. extern __inline __m64
  870.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  871.     _mm_cvtpd_pi32(__m128d __A) {
  872.   __m128i __result = _mm_cvtpd_epi32(__A);
  873.  
  874.   return (__m64)__result[0];
  875. }
  876.  
  877. extern __inline __m128
  878.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  879.     _mm_cvtpd_ps(__m128d __A) {
  880.   __v4sf __result;
  881.   __v4si __temp;
  882.   const __v4si __vzero = {0, 0, 0, 0};
  883.  
  884.   __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
  885.  
  886. #ifdef _ARCH_PWR8
  887. #ifdef __LITTLE_ENDIAN__
  888.   __temp = vec_mergeo(__temp, __temp);
  889. #else
  890.   __temp = vec_mergee(__temp, __temp);
  891. #endif
  892.   __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
  893.                                  (__vector long long)__vzero);
  894. #else
  895.   {
  896.     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
  897.                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
  898.     __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  899.   }
  900. #endif
  901.   return ((__m128)__result);
  902. }
  903.  
  904. extern __inline __m128i
  905.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  906.     _mm_cvttpd_epi32(__m128d __A) {
  907.   __v4si __result;
  908.   __v4si __temp;
  909.   const __v4si __vzero = {0, 0, 0, 0};
  910.  
  911.   /* VSX Vector truncate Double-Precision to integer and Convert to
  912.    Signed Integer Word format with Saturate.  */
  913.   __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
  914.  
  915. #ifdef _ARCH_PWR8
  916. #ifdef __LITTLE_ENDIAN__
  917.   __temp = vec_mergeo(__temp, __temp);
  918. #else
  919.   __temp = vec_mergee(__temp, __temp);
  920. #endif
  921.   __result = (__v4si)vec_vpkudum((__vector long long)__temp,
  922.                                  (__vector long long)__vzero);
  923. #else
  924.   {
  925.     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
  926.                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
  927.     __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
  928.   }
  929. #endif
  930.  
  931.   return ((__m128i)__result);
  932. }
  933.  
  934. extern __inline __m64
  935.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  936.     _mm_cvttpd_pi32(__m128d __A) {
  937.   __m128i __result = _mm_cvttpd_epi32(__A);
  938.  
  939.   return (__m64)__result[0];
  940. }
  941.  
  942. extern __inline int
  943.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  944.     _mm_cvtsi128_si32(__m128i __A) {
  945.   return ((__v4si)__A)[0];
  946. }
  947.  
  948. #ifdef _ARCH_PWR8
  949. extern __inline __m128d
  950.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  951.     _mm_cvtpi32_pd(__m64 __A) {
  952.   __v4si __temp;
  953.   __v2di __tmp2;
  954.   __v2df __result;
  955.  
  956.   __temp = (__v4si)vec_splats(__A);
  957.   __tmp2 = (__v2di)vec_unpackl(__temp);
  958.   __result = vec_ctf((__vector signed long long)__tmp2, 0);
  959.   return (__m128d)__result;
  960. }
  961. #endif
  962.  
  963. extern __inline __m128i
  964.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  965.     _mm_cvtps_epi32(__m128 __A) {
  966.   __v4sf __rounded;
  967.   __v4si __result;
  968.  
  969.   __rounded = vec_rint((__v4sf)__A);
  970.   __result = vec_cts(__rounded, 0);
  971.   return (__m128i)__result;
  972. }
  973.  
  974. extern __inline __m128i
  975.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  976.     _mm_cvttps_epi32(__m128 __A) {
  977.   __v4si __result;
  978.  
  979.   __result = vec_cts((__v4sf)__A, 0);
  980.   return (__m128i)__result;
  981. }
  982.  
  983. extern __inline __m128d
  984.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  985.     _mm_cvtps_pd(__m128 __A) {
  986.   /* Check if vec_doubleh is defined by <altivec.h>.  If so, use that.  */
  987. #ifdef vec_doubleh
  988.   return (__m128d)vec_doubleh((__v4sf)__A);
  989. #else
  990.   /* Otherwise the compiler is older, so we need to generate the
  991.      equivalent code ourselves.  */
  992.   __v4sf __a = (__v4sf)__A;
  993.   __v4sf __temp;
  994.   __v2df __result;
  995. #ifdef __LITTLE_ENDIAN__
  996.   /* The input float values are in elements {[0], [1]} but the convert
  997.      instruction needs them in elements {[1], [3]}, so we use two
  998.      shift left double vector word immediates to get the elements
  999.      lined up.  */
  1000.   __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
  1001.   __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
  1002. #else
  1003.   /* The input float values are in elements {[0], [1]} but the convert
  1004.      instruction needs them in elements {[0], [2]}, so we use a
  1005.      merge high word (vec_vmrghw) to get the elements
  1006.      lined up.  */
  1007.   __temp = vec_vmrghw(__a, __a);
  1008. #endif
  1009.   __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
  1010.   return (__m128d)__result;
  1011. #endif
  1012. }
  1013.  
  1014. extern __inline int
  1015.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1016.     _mm_cvtsd_si32(__m128d __A) {
  1017.   __v2df __rounded = vec_rint((__v2df)__A);
  1018.   int __result = ((__v2df)__rounded)[0];
  1019.  
  1020.   return __result;
  1021. }
  1022. /* Intel intrinsic.  */
  1023. extern __inline long long
  1024.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1025.     _mm_cvtsd_si64(__m128d __A) {
  1026.   __v2df __rounded = vec_rint((__v2df)__A);
  1027.   long long __result = ((__v2df)__rounded)[0];
  1028.  
  1029.   return __result;
  1030. }
  1031.  
  1032. /* Microsoft intrinsic.  */
  1033. extern __inline long long
  1034.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1035.     _mm_cvtsd_si64x(__m128d __A) {
  1036.   return _mm_cvtsd_si64((__v2df)__A);
  1037. }
  1038.  
  1039. extern __inline int
  1040.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1041.     _mm_cvttsd_si32(__m128d __A) {
  1042.   int __result = ((__v2df)__A)[0];
  1043.  
  1044.   return __result;
  1045. }
  1046.  
  1047. /* Intel intrinsic.  */
  1048. extern __inline long long
  1049.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1050.     _mm_cvttsd_si64(__m128d __A) {
  1051.   long long __result = ((__v2df)__A)[0];
  1052.  
  1053.   return __result;
  1054. }
  1055.  
  1056. /* Microsoft intrinsic.  */
  1057. extern __inline long long
  1058.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1059.     _mm_cvttsd_si64x(__m128d __A) {
  1060.   return _mm_cvttsd_si64(__A);
  1061. }
  1062.  
  1063. extern __inline __m128
  1064.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1065.     _mm_cvtsd_ss(__m128 __A, __m128d __B) {
  1066.   __v4sf __result = (__v4sf)__A;
  1067.  
  1068. #ifdef __LITTLE_ENDIAN__
  1069.   __v4sf __temp_s;
  1070.   /* Copy double element[0] to element [1] for conversion.  */
  1071.   __v2df __temp_b = vec_splat((__v2df)__B, 0);
  1072.  
  1073.   /* Pre-rotate __A left 3 (logically right 1) elements.  */
  1074.   __result = __builtin_vsx_xxsldwi(__result, __result, 3);
  1075.   /* Convert double to single float scalar in a vector.  */
  1076.   __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
  1077.   /* Shift the resulting scalar into vector element [0].  */
  1078.   __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
  1079. #else
  1080.   __result[0] = ((__v2df)__B)[0];
  1081. #endif
  1082.   return (__m128)__result;
  1083. }
  1084.  
  1085. extern __inline __m128d
  1086.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1087.     _mm_cvtsi32_sd(__m128d __A, int __B) {
  1088.   __v2df __result = (__v2df)__A;
  1089.   double __db = __B;
  1090.   __result[0] = __db;
  1091.   return (__m128d)__result;
  1092. }
  1093.  
  1094. /* Intel intrinsic.  */
  1095. extern __inline __m128d
  1096.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1097.     _mm_cvtsi64_sd(__m128d __A, long long __B) {
  1098.   __v2df __result = (__v2df)__A;
  1099.   double __db = __B;
  1100.   __result[0] = __db;
  1101.   return (__m128d)__result;
  1102. }
  1103.  
  1104. /* Microsoft intrinsic.  */
  1105. extern __inline __m128d
  1106.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1107.     _mm_cvtsi64x_sd(__m128d __A, long long __B) {
  1108.   return _mm_cvtsi64_sd(__A, __B);
  1109. }
  1110.  
  1111. extern __inline __m128d
  1112.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1113.     _mm_cvtss_sd(__m128d __A, __m128 __B) {
  1114. #ifdef __LITTLE_ENDIAN__
  1115.   /* Use splat to move element [0] into position for the convert. */
  1116.   __v4sf __temp = vec_splat((__v4sf)__B, 0);
  1117.   __v2df __res;
  1118.   /* Convert single float scalar to double in a vector.  */
  1119.   __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
  1120.   return (__m128d)vec_mergel(__res, (__v2df)__A);
  1121. #else
  1122.   __v2df __res = (__v2df)__A;
  1123.   __res[0] = ((__v4sf)__B)[0];
  1124.   return (__m128d)__res;
  1125. #endif
  1126. }
  1127.  
  1128. extern __inline __m128d
  1129.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1130.     _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
  1131.   __vector double __result;
  1132.   const int __litmsk = __mask & 0x3;
  1133.  
  1134.   if (__litmsk == 0)
  1135.     __result = vec_mergeh(__A, __B);
  1136. #if __GNUC__ < 6
  1137.   else if (__litmsk == 1)
  1138.     __result = vec_xxpermdi(__B, __A, 2);
  1139.   else if (__litmsk == 2)
  1140.     __result = vec_xxpermdi(__B, __A, 1);
  1141. #else
  1142.   else if (__litmsk == 1)
  1143.     __result = vec_xxpermdi(__A, __B, 2);
  1144.   else if (__litmsk == 2)
  1145.     __result = vec_xxpermdi(__A, __B, 1);
  1146. #endif
  1147.   else
  1148.     __result = vec_mergel(__A, __B);
  1149.  
  1150.   return __result;
  1151. }
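/* For illustration of the mask bits: bit 0 selects which element of __A
   goes to element [0] of the result and bit 1 selects which element of __B
   goes to element [1], so _mm_shuffle_pd(__A, __B, 0) yields
   {__A[0], __B[0]} and _mm_shuffle_pd(__A, __B, 3) yields {__A[1], __B[1]}.  */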
  1152.  
  1153. extern __inline __m128d
  1154.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1155.     _mm_unpackhi_pd(__m128d __A, __m128d __B) {
  1156.   return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
  1157. }
  1158.  
  1159. extern __inline __m128d
  1160.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1161.     _mm_unpacklo_pd(__m128d __A, __m128d __B) {
  1162.   return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
  1163. }
  1164.  
  1165. extern __inline __m128d
  1166.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1167.     _mm_loadh_pd(__m128d __A, double const *__B) {
  1168.   __v2df __result = (__v2df)__A;
  1169.   __result[1] = *__B;
  1170.   return (__m128d)__result;
  1171. }
  1172.  
  1173. extern __inline __m128d
  1174.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1175.     _mm_loadl_pd(__m128d __A, double const *__B) {
  1176.   __v2df __result = (__v2df)__A;
  1177.   __result[0] = *__B;
  1178.   return (__m128d)__result;
  1179. }
  1180.  
  1181. #ifdef _ARCH_PWR8
  1182. /* Intrinsic functions that require PowerISA 2.07 minimum.  */
  1183.  
  1184. /* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
  1185. extern __inline int
  1186.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1187.     _mm_movemask_pd(__m128d __A) {
  1188. #ifdef _ARCH_PWR10
  1189.   return vec_extractm((__v2du)__A);
  1190. #else
  1191.   __vector unsigned long long __result;
  1192.   static const __vector unsigned int __perm_mask = {
  1193. #ifdef __LITTLE_ENDIAN__
  1194.       0x80800040, 0x80808080, 0x80808080, 0x80808080
  1195. #else
  1196.       0x80808080, 0x80808080, 0x80808080, 0x80804000
  1197. #endif
  1198.   };
  1199.  
  1200.   __result = ((__vector unsigned long long)vec_vbpermq(
  1201.       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
  1202.  
  1203. #ifdef __LITTLE_ENDIAN__
  1204.   return __result[1];
  1205. #else
  1206.   return __result[0];
  1207. #endif
  1208. #endif /* !_ARCH_PWR10 */
  1209. }
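/* For illustration: the result packs the two sign bits, so
   _mm_movemask_pd(_mm_set_pd(3.0, -1.0)) == 0x1 (only element [0] is
   negative) and _mm_movemask_pd(_mm_set_pd(-3.0, -1.0)) == 0x3.  */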
  1210. #endif /* _ARCH_PWR8 */
  1211.  
  1212. extern __inline __m128i
  1213.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1214.     _mm_packs_epi16(__m128i __A, __m128i __B) {
  1215.   return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
  1216. }
  1217.  
  1218. extern __inline __m128i
  1219.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1220.     _mm_packs_epi32(__m128i __A, __m128i __B) {
  1221.   return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
  1222. }
  1223.  
  1224. extern __inline __m128i
  1225.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1226.     _mm_packus_epi16(__m128i __A, __m128i __B) {
  1227.   return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
  1228. }
  1229.  
  1230. extern __inline __m128i
  1231.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1232.     _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
  1233.   return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
  1234. }
  1235.  
  1236. extern __inline __m128i
  1237.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1238.     _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
  1239.   return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
  1240. }
  1241.  
  1242. extern __inline __m128i
  1243.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1244.     _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
  1245.   return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
  1246. }
  1247.  
  1248. extern __inline __m128i
  1249.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1250.     _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
  1251.   return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
  1252. }
  1253.  
  1254. extern __inline __m128i
  1255.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1256.     _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
  1257.   return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
  1258. }
  1259.  
  1260. extern __inline __m128i
  1261.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1262.     _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
  1263.   return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
  1264. }
  1265.  
  1266. extern __inline __m128i
  1267.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1268.     _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
  1269.   return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
  1270. }
  1271.  
  1272. extern __inline __m128i
  1273.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1274.     _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
  1275.   return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
  1276. }
  1277.  
  1278. extern __inline __m128i
  1279.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1280.     _mm_add_epi8(__m128i __A, __m128i __B) {
  1281.   return (__m128i)((__v16qu)__A + (__v16qu)__B);
  1282. }
  1283.  
  1284. extern __inline __m128i
  1285.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1286.     _mm_add_epi16(__m128i __A, __m128i __B) {
  1287.   return (__m128i)((__v8hu)__A + (__v8hu)__B);
  1288. }
  1289.  
  1290. extern __inline __m128i
  1291.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1292.     _mm_add_epi32(__m128i __A, __m128i __B) {
  1293.   return (__m128i)((__v4su)__A + (__v4su)__B);
  1294. }
  1295.  
  1296. extern __inline __m128i
  1297.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1298.     _mm_add_epi64(__m128i __A, __m128i __B) {
  1299.   return (__m128i)((__v2du)__A + (__v2du)__B);
  1300. }
  1301.  
  1302. extern __inline __m128i
  1303.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1304.     _mm_adds_epi8(__m128i __A, __m128i __B) {
  1305.   return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
  1306. }
  1307.  
  1308. extern __inline __m128i
  1309.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1310.     _mm_adds_epi16(__m128i __A, __m128i __B) {
  1311.   return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
  1312. }
  1313.  
  1314. extern __inline __m128i
  1315.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1316.     _mm_adds_epu8(__m128i __A, __m128i __B) {
  1317.   return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
  1318. }
  1319.  
  1320. extern __inline __m128i
  1321.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1322.     _mm_adds_epu16(__m128i __A, __m128i __B) {
  1323.   return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
  1324. }
  1325.  
  1326. extern __inline __m128i
  1327.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1328.     _mm_sub_epi8(__m128i __A, __m128i __B) {
  1329.   return (__m128i)((__v16qu)__A - (__v16qu)__B);
  1330. }
  1331.  
  1332. extern __inline __m128i
  1333.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1334.     _mm_sub_epi16(__m128i __A, __m128i __B) {
  1335.   return (__m128i)((__v8hu)__A - (__v8hu)__B);
  1336. }
  1337.  
  1338. extern __inline __m128i
  1339.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1340.     _mm_sub_epi32(__m128i __A, __m128i __B) {
  1341.   return (__m128i)((__v4su)__A - (__v4su)__B);
  1342. }
  1343.  
  1344. extern __inline __m128i
  1345.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1346.     _mm_sub_epi64(__m128i __A, __m128i __B) {
  1347.   return (__m128i)((__v2du)__A - (__v2du)__B);
  1348. }
  1349.  
  1350. extern __inline __m128i
  1351.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1352.     _mm_subs_epi8(__m128i __A, __m128i __B) {
  1353.   return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
  1354. }
  1355.  
  1356. extern __inline __m128i
  1357.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1358.     _mm_subs_epi16(__m128i __A, __m128i __B) {
  1359.   return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
  1360. }
  1361.  
  1362. extern __inline __m128i
  1363.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1364.     _mm_subs_epu8(__m128i __A, __m128i __B) {
  1365.   return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
  1366. }
  1367.  
  1368. extern __inline __m128i
  1369.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1370.     _mm_subs_epu16(__m128i __A, __m128i __B) {
  1371.   return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
  1372. }
  1373.  
  1374. extern __inline __m128i
  1375.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1376.     _mm_madd_epi16(__m128i __A, __m128i __B) {
  1377.   __vector signed int __zero = {0, 0, 0, 0};
  1378.  
  1379.   return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
  1380. }
  1381.  
  1382. extern __inline __m128i
  1383.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1384.     _mm_mulhi_epi16(__m128i __A, __m128i __B) {
  1385.   __vector signed int __w0, __w1;
  1386.  
  1387.   __vector unsigned char __xform1 = {
  1388. #ifdef __LITTLE_ENDIAN__
  1389.       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
  1390.       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
  1391. #else
  1392.       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
  1393.       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
  1394. #endif
  1395.   };
  1396.  
  1397.   __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
  1398.   __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
  1399.   return (__m128i)vec_perm(__w0, __w1, __xform1);
  1400. }
  1401.  
  1402. extern __inline __m128i
  1403.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1404.     _mm_mullo_epi16(__m128i __A, __m128i __B) {
  1405.   return (__m128i)((__v8hi)__A * (__v8hi)__B);
  1406. }
  1407.  
  1408. extern __inline __m64
  1409.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1410.     _mm_mul_su32(__m64 __A, __m64 __B) {
  1411.   unsigned int __a = __A;
  1412.   unsigned int __b = __B;
  1413.  
  1414.   return ((__m64)__a * (__m64)__b);
  1415. }
  1416.  
  1417. #ifdef _ARCH_PWR8
  1418. extern __inline __m128i
  1419.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1420.     _mm_mul_epu32(__m128i __A, __m128i __B) {
  1421. #if __GNUC__ < 8
  1422.   __v2du __result;
  1423.  
  1424. #ifdef __LITTLE_ENDIAN__
  1425.   /* VMX Vector Multiply Odd Unsigned Word.  */
  1426.   __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
  1427. #else
  1428.   /* VMX Vector Multiply Even Unsigned Word.  */
  1429.   __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
  1430. #endif
  1431.   return (__m128i)__result;
  1432. #else
  1433.   return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
  1434. #endif
  1435. }
  1436. #endif
  1437.  
  1438. extern __inline __m128i
  1439.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1440.     _mm_slli_epi16(__m128i __A, int __B) {
  1441.   __v8hu __lshift;
  1442.   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
  1443.  
  1444.   if (__B >= 0 && __B < 16) {
  1445.     if (__builtin_constant_p(__B))
  1446.       __lshift = (__v8hu)vec_splat_s16(__B);
  1447.     else
  1448.       __lshift = vec_splats((unsigned short)__B);
  1449.  
  1450.     __result = vec_sl((__v8hi)__A, __lshift);
  1451.   }
  1452.  
  1453.   return (__m128i)__result;
  1454. }
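
/* Note the explicit range check: the PowerISA vector shifts use the
   count modulo the element width, while the SSE2 immediate shifts must
   return zero once the count reaches the element width.  For example
   (hypothetical values):

     _mm_slli_epi16(_mm_set1_epi16(1), 3);    // each element == 8
     _mm_slli_epi16(_mm_set1_epi16(1), 16);   // each element == 0
*/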
  1455.  
  1456. extern __inline __m128i
  1457.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1458.     _mm_slli_epi32(__m128i __A, int __B) {
  1459.   __v4su __lshift;
  1460.   __v4si __result = {0, 0, 0, 0};
  1461.  
  1462.   if (__B >= 0 && __B < 32) {
  1463.     if (__builtin_constant_p(__B) && __B < 16)
  1464.       __lshift = (__v4su)vec_splat_s32(__B);
  1465.     else
  1466.       __lshift = vec_splats((unsigned int)__B);
  1467.  
  1468.     __result = vec_sl((__v4si)__A, __lshift);
  1469.   }
  1470.  
  1471.   return (__m128i)__result;
  1472. }
  1473.  
  1474. #ifdef _ARCH_PWR8
  1475. extern __inline __m128i
  1476.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1477.     _mm_slli_epi64(__m128i __A, int __B) {
  1478.   __v2du __lshift;
  1479.   __v2di __result = {0, 0};
  1480.  
  1481.   if (__B >= 0 && __B < 64) {
  1482.     if (__builtin_constant_p(__B) && __B < 16)
  1483.       __lshift = (__v2du)vec_splat_s32(__B);
  1484.     else
  1485.       __lshift = (__v2du)vec_splats((unsigned int)__B);
  1486.  
  1487.     __result = vec_sl((__v2di)__A, __lshift);
  1488.   }
  1489.  
  1490.   return (__m128i)__result;
  1491. }
  1492. #endif
  1493.  
  1494. extern __inline __m128i
  1495.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1496.     _mm_srai_epi16(__m128i __A, int __B) {
  1497.   __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
  1498.   __v8hi __result;
  1499.  
  1500.   if (__B < 16) {
  1501.     if (__builtin_constant_p(__B))
  1502.       __rshift = (__v8hu)vec_splat_s16(__B);
  1503.     else
  1504.       __rshift = vec_splats((unsigned short)__B);
  1505.   }
  1506.   __result = vec_sra((__v8hi)__A, __rshift);
  1507.  
  1508.   return (__m128i)__result;
  1509. }
  1510.  
  1511. extern __inline __m128i
  1512.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1513.     _mm_srai_epi32(__m128i __A, int __B) {
  1514.   __v4su __rshift = {31, 31, 31, 31};
  1515.   __v4si __result;
  1516.  
  1517.   if (__B < 32) {
  1518.     if (__builtin_constant_p(__B)) {
  1519.       if (__B < 16)
  1520.         __rshift = (__v4su)vec_splat_s32(__B);
  1521.       else
  1522.         __rshift = (__v4su)vec_splats((unsigned int)__B);
  1523.     } else
  1524.       __rshift = vec_splats((unsigned int)__B);
  1525.   }
  1526.   __result = vec_sra((__v4si)__A, __rshift);
  1527.  
  1528.   return (__m128i)__result;
  1529. }
  1530.  
  1531. extern __inline __m128i
  1532.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1533.     _mm_bslli_si128(__m128i __A, const int __N) {
  1534.   __v16qu __result;
  1535.   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  1536.  
  1537.   if (__N < 16)
  1538.     __result = vec_sld((__v16qu)__A, __zeros, __N);
  1539.   else
  1540.     __result = __zeros;
  1541.  
  1542.   return (__m128i)__result;
  1543. }
  1544.  
  1545. extern __inline __m128i
  1546.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1547.     _mm_bsrli_si128(__m128i __A, const int __N) {
  1548.   __v16qu __result;
  1549.   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  1550.  
  1551.   if (__N < 16)
  1552. #ifdef __LITTLE_ENDIAN__
  1553.     if (__builtin_constant_p(__N))
  1554.       /* Use Vector Shift Left Double by Octet Immediate here so the
  1555.          shift amount is encoded in the instruction and no __N * 8
  1556.          value needs to be loaded into a separate VR.  */
  1557.       __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
  1558.     else
  1559. #endif
  1560.     {
  1561.       __v16qu __shift = vec_splats((unsigned char)(__N * 8));
  1562. #ifdef __LITTLE_ENDIAN__
  1563.       __result = vec_sro((__v16qu)__A, __shift);
  1564. #else
  1565.       __result = vec_slo((__v16qu)__A, __shift);
  1566. #endif
  1567.     }
  1568.   else
  1569.     __result = __zeros;
  1570.  
  1571.   return (__m128i)__result;
  1572. }
  1573.  
  1574. extern __inline __m128i
  1575.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1576.     _mm_srli_si128(__m128i __A, const int __N) {
  1577.   return _mm_bsrli_si128(__A, __N);
  1578. }
  1579.  
  1580. extern __inline __m128i
  1581.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1582.     _mm_slli_si128(__m128i __A, const int _imm5) {
  1583.   __v16qu __result;
  1584.   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  1585.  
  1586.   if (_imm5 < 16)
  1587. #ifdef __LITTLE_ENDIAN__
  1588.     __result = vec_sld((__v16qu)__A, __zeros, _imm5);
  1589. #else
  1590.     __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
  1591. #endif
  1592.   else
  1593.     __result = __zeros;
  1594.  
  1595.   return (__m128i)__result;
  1596. }
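
/* Despite the _slli/_srli names, _mm_slli_si128 and _mm_srli_si128 (and
   the _mm_bslli/_mm_bsrli variants above) shift the whole 128-bit value
   by a byte count, shifting in zeroes, which is why they are built on
   vec_sld/vec_sro/vec_slo rather than on the element shifts.  For example
   (hypothetical values):

     __m128i v = _mm_set_epi32(0x44444444, 0x33333333,
                               0x22222222, 0x11111111);
     _mm_srli_si128(v, 4);
     // == _mm_set_epi32(0, 0x44444444, 0x33333333, 0x22222222)
*/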
  1597.  
  1598. extern __inline __m128i
  1599.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1601.     _mm_srli_epi16(__m128i __A, int __B) {
  1602.   __v8hu __rshift;
  1603.   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
  1604.  
  1605.   if (__B < 16) {
  1606.     if (__builtin_constant_p(__B))
  1607.       __rshift = (__v8hu)vec_splat_s16(__B);
  1608.     else
  1609.       __rshift = vec_splats((unsigned short)__B);
  1610.  
  1611.     __result = vec_sr((__v8hi)__A, __rshift);
  1612.   }
  1613.  
  1614.   return (__m128i)__result;
  1615. }
  1616.  
  1617. extern __inline __m128i
  1618.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1619.     _mm_srli_epi32(__m128i __A, int __B) {
  1620.   __v4su __rshift;
  1621.   __v4si __result = {0, 0, 0, 0};
  1622.  
  1623.   if (__B < 32) {
  1624.     if (__builtin_constant_p(__B)) {
  1625.       if (__B < 16)
  1626.         __rshift = (__v4su)vec_splat_s32(__B);
  1627.       else
  1628.         __rshift = (__v4su)vec_splats((unsigned int)__B);
  1629.     } else
  1630.       __rshift = vec_splats((unsigned int)__B);
  1631.  
  1632.     __result = vec_sr((__v4si)__A, __rshift);
  1633.   }
  1634.  
  1635.   return (__m128i)__result;
  1636. }
  1637.  
  1638. #ifdef _ARCH_PWR8
  1639. extern __inline __m128i
  1640.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1641.     _mm_srli_epi64(__m128i __A, int __B) {
  1642.   __v2du __rshift;
  1643.   __v2di __result = {0, 0};
  1644.  
  1645.   if (__B < 64) {
  1646.     if (__builtin_constant_p(__B)) {
  1647.       if (__B < 16)
  1648.         __rshift = (__v2du)vec_splat_s32(__B);
  1649.       else
  1650.         __rshift = (__v2du)vec_splats((unsigned long long)__B);
  1651.     } else
  1652.       __rshift = (__v2du)vec_splats((unsigned int)__B);
  1653.  
  1654.     __result = vec_sr((__v2di)__A, __rshift);
  1655.   }
  1656.  
  1657.   return (__m128i)__result;
  1658. }
  1659. #endif
  1660.  
  1661. extern __inline __m128i
  1662.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1663.     _mm_sll_epi16(__m128i __A, __m128i __B) {
  1664.   __v8hu __lshift;
  1665.   __vector __bool short __shmask;
  1666.   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  1667.   __v8hu __result;
  1668.  
  1669. #ifdef __LITTLE_ENDIAN__
  1670.   __lshift = vec_splat((__v8hu)__B, 0);
  1671. #else
  1672.   __lshift = vec_splat((__v8hu)__B, 3);
  1673. #endif
  1674.   __shmask = vec_cmple(__lshift, __shmax);
  1675.   __result = vec_sl((__v8hu)__A, __lshift);
  1676.   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
  1677.  
  1678.   return (__m128i)__result;
  1679. }
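
/* The vec_cmple mask is all-ones while the splatted count is a valid one
   (0..15); vec_sel then keeps the shifted value for a valid count and
   otherwise selects from the mask itself, which is all-zero in that case,
   so oversized counts produce zero without a branch.  The epi32/epi64
   forms below use the same pattern.  */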
  1680.  
  1681. extern __inline __m128i
  1682.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1683.     _mm_sll_epi32(__m128i __A, __m128i __B) {
  1684.   __v4su __lshift;
  1685.   __vector __bool int __shmask;
  1686.   const __v4su __shmax = {32, 32, 32, 32};
  1687.   __v4su __result;
  1688. #ifdef __LITTLE_ENDIAN__
  1689.   __lshift = vec_splat((__v4su)__B, 0);
  1690. #else
  1691.   __lshift = vec_splat((__v4su)__B, 1);
  1692. #endif
  1693.   __shmask = vec_cmplt(__lshift, __shmax);
  1694.   __result = vec_sl((__v4su)__A, __lshift);
  1695.   __result = vec_sel((__v4su)__shmask, __result, __shmask);
  1696.  
  1697.   return (__m128i)__result;
  1698. }
  1699.  
  1700. #ifdef _ARCH_PWR8
  1701. extern __inline __m128i
  1702.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1703.     _mm_sll_epi64(__m128i __A, __m128i __B) {
  1704.   __v2du __lshift;
  1705.   __vector __bool long long __shmask;
  1706.   const __v2du __shmax = {64, 64};
  1707.   __v2du __result;
  1708.  
  1709.   __lshift = vec_splat((__v2du)__B, 0);
  1710.   __shmask = vec_cmplt(__lshift, __shmax);
  1711.   __result = vec_sl((__v2du)__A, __lshift);
  1712.   __result = vec_sel((__v2du)__shmask, __result, __shmask);
  1713.  
  1714.   return (__m128i)__result;
  1715. }
  1716. #endif
  1717.  
  1718. extern __inline __m128i
  1719.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1720.     _mm_sra_epi16(__m128i __A, __m128i __B) {
  1721.   const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
  1722.   __v8hu __rshift;
  1723.   __v8hi __result;
  1724.  
  1725. #ifdef __LITTLE_ENDIAN__
  1726.   __rshift = vec_splat((__v8hu)__B, 0);
  1727. #else
  1728.   __rshift = vec_splat((__v8hu)__B, 3);
  1729. #endif
  1730.   __rshift = vec_min(__rshift, __rshmax);
  1731.   __result = vec_sra((__v8hi)__A, __rshift);
  1732.  
  1733.   return (__m128i)__result;
  1734. }
  1735.  
  1736. extern __inline __m128i
  1737.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1738.     _mm_sra_epi32(__m128i __A, __m128i __B) {
  1739.   const __v4su __rshmax = {31, 31, 31, 31};
  1740.   __v4su __rshift;
  1741.   __v4si __result;
  1742.  
  1743. #ifdef __LITTLE_ENDIAN__
  1744.   __rshift = vec_splat((__v4su)__B, 0);
  1745. #else
  1746.   __rshift = vec_splat((__v4su)__B, 1);
  1747. #endif
  1748.   __rshift = vec_min(__rshift, __rshmax);
  1749.   __result = vec_sra((__v4si)__A, __rshift);
  1750.  
  1751.   return (__m128i)__result;
  1752. }
  1753.  
  1754. extern __inline __m128i
  1755.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1756.     _mm_srl_epi16(__m128i __A, __m128i __B) {
  1757.   __v8hu __rshift;
  1758.   __vector __bool short __shmask;
  1759.   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  1760.   __v8hu __result;
  1761.  
  1762. #ifdef __LITTLE_ENDIAN__
  1763.   __rshift = vec_splat((__v8hu)__B, 0);
  1764. #else
  1765.   __rshift = vec_splat((__v8hu)__B, 3);
  1766. #endif
  1767.   __shmask = vec_cmple(__rshift, __shmax);
  1768.   __result = vec_sr((__v8hu)__A, __rshift);
  1769.   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
  1770.  
  1771.   return (__m128i)__result;
  1772. }
  1773.  
  1774. extern __inline __m128i
  1775.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1776.     _mm_srl_epi32(__m128i __A, __m128i __B) {
  1777.   __v4su __rshift;
  1778.   __vector __bool int __shmask;
  1779.   const __v4su __shmax = {32, 32, 32, 32};
  1780.   __v4su __result;
  1781.  
  1782. #ifdef __LITTLE_ENDIAN__
  1783.   __rshift = vec_splat((__v4su)__B, 0);
  1784. #else
  1785.   __rshift = vec_splat((__v4su)__B, 1);
  1786. #endif
  1787.   __shmask = vec_cmplt(__rshift, __shmax);
  1788.   __result = vec_sr((__v4su)__A, __rshift);
  1789.   __result = vec_sel((__v4su)__shmask, __result, __shmask);
  1790.  
  1791.   return (__m128i)__result;
  1792. }
  1793.  
  1794. #ifdef _ARCH_PWR8
  1795. extern __inline __m128i
  1796.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1797.     _mm_srl_epi64(__m128i __A, __m128i __B) {
  1798.   __v2du __rshift;
  1799.   __vector __bool long long __shmask;
  1800.   const __v2du __shmax = {64, 64};
  1801.   __v2du __result;
  1802.  
  1803.   __rshift = vec_splat((__v2du)__B, 0);
  1804.   __shmask = vec_cmplt(__rshift, __shmax);
  1805.   __result = vec_sr((__v2du)__A, __rshift);
  1806.   __result = vec_sel((__v2du)__shmask, __result, __shmask);
  1807.  
  1808.   return (__m128i)__result;
  1809. }
  1810. #endif
  1811.  
  1812. extern __inline __m128d
  1813.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1814.     _mm_and_pd(__m128d __A, __m128d __B) {
  1815.   return (vec_and((__v2df)__A, (__v2df)__B));
  1816. }
  1817.  
  1818. extern __inline __m128d
  1819.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1820.     _mm_andnot_pd(__m128d __A, __m128d __B) {
  1821.   return (vec_andc((__v2df)__B, (__v2df)__A));
  1822. }
  1823.  
  1824. extern __inline __m128d
  1825.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1826.     _mm_or_pd(__m128d __A, __m128d __B) {
  1827.   return (vec_or((__v2df)__A, (__v2df)__B));
  1828. }
  1829.  
  1830. extern __inline __m128d
  1831.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1832.     _mm_xor_pd(__m128d __A, __m128d __B) {
  1833.   return (vec_xor((__v2df)__A, (__v2df)__B));
  1834. }
  1835.  
  1836. extern __inline __m128i
  1837.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1838.     _mm_and_si128(__m128i __A, __m128i __B) {
  1839.   return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
  1840. }
  1841.  
  1842. extern __inline __m128i
  1843.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1844.     _mm_andnot_si128(__m128i __A, __m128i __B) {
  1845.   return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
  1846. }
  1847.  
  1848. extern __inline __m128i
  1849.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1850.     _mm_or_si128(__m128i __A, __m128i __B) {
  1851.   return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
  1852. }
  1853.  
  1854. extern __inline __m128i
  1855.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1856.     _mm_xor_si128(__m128i __A, __m128i __B) {
  1857.   return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
  1858. }
  1859.  
  1860. extern __inline __m128i
  1861.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1862.     _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  1863.   return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
  1864. }
  1865.  
  1866. extern __inline __m128i
  1867.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1868.     _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  1869.   return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
  1870. }
  1871.  
  1872. extern __inline __m128i
  1873.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1874.     _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  1875.   return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
  1876. }
  1877.  
  1878. extern __inline __m128i
  1879.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1880.     _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  1881.   return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
  1882. }
  1883.  
  1884. extern __inline __m128i
  1885.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1886.     _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  1887.   return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
  1888. }
  1889.  
  1890. extern __inline __m128i
  1891.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1892.     _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  1893.   return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
  1894. }
  1895.  
  1896. extern __inline __m128i
  1897.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1898.     _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  1899.   return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
  1900. }
  1901.  
  1902. extern __inline __m128i
  1903.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1904.     _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  1905.   return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
  1906. }
  1907.  
  1908. extern __inline __m128i
  1909.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1910.     _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  1911.   return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
  1912. }
  1913.  
  1914. extern __inline int
  1915.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1916.     _mm_extract_epi16(__m128i const __A, int const __N) {
  1917.   return (unsigned short)((__v8hi)__A)[__N & 7];
  1918. }
  1919.  
  1920. extern __inline __m128i
  1921.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1922.     _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  1923.   __v8hi __result = (__v8hi)__A;
  1924.  
  1925.   __result[(__N & 7)] = __D;
  1926.  
  1927.   return (__m128i)__result;
  1928. }
  1929.  
  1930. extern __inline __m128i
  1931.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1932.     _mm_max_epi16(__m128i __A, __m128i __B) {
  1933.   return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
  1934. }
  1935.  
  1936. extern __inline __m128i
  1937.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1938.     _mm_max_epu8(__m128i __A, __m128i __B) {
  1939.   return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
  1940. }
  1941.  
  1942. extern __inline __m128i
  1943.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1944.     _mm_min_epi16(__m128i __A, __m128i __B) {
  1945.   return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
  1946. }
  1947.  
  1948. extern __inline __m128i
  1949.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1950.     _mm_min_epu8(__m128i __A, __m128i __B) {
  1951.   return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
  1952. }
  1953.  
  1954. #ifdef _ARCH_PWR8
  1955. /* Intrinsic functions that require PowerISA 2.07 minimum.  */
  1956.  
  1957. /* Return a mask created from the most significant bit of each 8-bit
  1958.    element in A.  */
  1959. extern __inline int
  1960.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1961.     _mm_movemask_epi8(__m128i __A) {
  1962. #ifdef _ARCH_PWR10
  1963.   return vec_extractm((__v16qu)__A);
  1964. #else
  1965.   __vector unsigned long long __result;
  1966.   static const __vector unsigned char __perm_mask = {
  1967.       0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
  1968.       0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
  1969.  
  1970.   __result = ((__vector unsigned long long)vec_vbpermq(
  1971.       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
  1972.  
  1973. #ifdef __LITTLE_ENDIAN__
  1974.   return __result[1];
  1975. #else
  1976.   return __result[0];
  1977. #endif
  1978. #endif /* !_ARCH_PWR10 */
  1979. }
  1980. #endif /* _ARCH_PWR8 */
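
/* vec_vbpermq gathers 16 selected bits of __A (here the most significant
   bit of each byte) into a single halfword, which is exactly the SSE2
   byte movemask.  For example (hypothetical values):

     __m128i v = _mm_set_epi8(-1, 0, -1, 0, -1, 0, -1, 0,
                              -1, 0, -1, 0, -1, 0, -1, 0);
     _mm_movemask_epi8(v);   // == 0xAAAA
*/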
  1981.  
  1982. extern __inline __m128i
  1983.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  1984.     _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  1985.   __v4su __w0, __w1;
  1986.   __v16qu __xform1 = {
  1987. #ifdef __LITTLE_ENDIAN__
  1988.       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
  1989.       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
  1990. #else
  1991.       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
  1992.       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
  1993. #endif
  1994.   };
  1995.  
  1996.   __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  1997.   __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  1998.   return (__m128i)vec_perm(__w0, __w1, __xform1);
  1999. }
  2000.  
  2001. extern __inline __m128i
  2002.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2003.     _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  2004.   unsigned long __element_selector_98 = __mask & 0x03;
  2005.   unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  2006.   unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  2007.   unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  2008.   static const unsigned short __permute_selectors[4] = {
  2009. #ifdef __LITTLE_ENDIAN__
  2010.       0x0908, 0x0B0A, 0x0D0C, 0x0F0E
  2011. #else
  2012.       0x0809, 0x0A0B, 0x0C0D, 0x0E0F
  2013. #endif
  2014.   };
  2015.   __v2du __pmask =
  2016. #ifdef __LITTLE_ENDIAN__
  2017.       {0x1716151413121110UL, 0UL};
  2018. #else
  2019.       {0x1011121314151617UL, 0UL};
  2020. #endif
  2021.   __m64_union __t;
  2022.   __v2du __a, __r;
  2023.  
  2024.   __t.as_short[0] = __permute_selectors[__element_selector_98];
  2025.   __t.as_short[1] = __permute_selectors[__element_selector_BA];
  2026.   __t.as_short[2] = __permute_selectors[__element_selector_DC];
  2027.   __t.as_short[3] = __permute_selectors[__element_selector_FE];
  2028.   __pmask[1] = __t.as_m64;
  2029.   __a = (__v2du)__A;
  2030.   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  2031.   return (__m128i)__r;
  2032. }
  2033.  
  2034. extern __inline __m128i
  2035.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2036.     _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  2037.   unsigned long __element_selector_10 = __mask & 0x03;
  2038.   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  2039.   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  2040.   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  2041.   static const unsigned short __permute_selectors[4] = {
  2042. #ifdef __LITTLE_ENDIAN__
  2043.       0x0100, 0x0302, 0x0504, 0x0706
  2044. #else
  2045.       0x0001, 0x0203, 0x0405, 0x0607
  2046. #endif
  2047.   };
  2048.   __v2du __pmask =
  2049. #ifdef __LITTLE_ENDIAN__
  2050.       {0UL, 0x1f1e1d1c1b1a1918UL};
  2051. #else
  2052.       {0UL, 0x18191a1b1c1d1e1fUL};
  2053. #endif
  2054.   __m64_union __t;
  2055.   __v2du __a, __r;
  2056.   __t.as_short[0] = __permute_selectors[__element_selector_10];
  2057.   __t.as_short[1] = __permute_selectors[__element_selector_32];
  2058.   __t.as_short[2] = __permute_selectors[__element_selector_54];
  2059.   __t.as_short[3] = __permute_selectors[__element_selector_76];
  2060.   __pmask[0] = __t.as_m64;
  2061.   __a = (__v2du)__A;
  2062.   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  2063.   return (__m128i)__r;
  2064. }
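
/* _mm_shufflehi_epi16 and _mm_shufflelo_epi16 reorder only the upper or
   lower four halfwords according to the four 2-bit fields of __mask; the
   other half passes through unchanged.  Both build a vec_perm control
   vector from __mask at run time.  For example (hypothetical values):

     __m128i v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
     _mm_shufflelo_epi16(v, 0x1B);
     // == _mm_set_epi16(7, 6, 5, 4, 0, 1, 2, 3)  (low half reversed)
*/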
  2065.  
  2066. extern __inline __m128i
  2067.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2068.     _mm_shuffle_epi32(__m128i __A, const int __mask) {
  2069.   unsigned long __element_selector_10 = __mask & 0x03;
  2070.   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  2071.   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  2072.   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  2073.   static const unsigned int __permute_selectors[4] = {
  2074. #ifdef __LITTLE_ENDIAN__
  2075.       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
  2076. #else
  2077.       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
  2078. #endif
  2079.   };
  2080.   __v4su __t;
  2081.  
  2082.   __t[0] = __permute_selectors[__element_selector_10];
  2083.   __t[1] = __permute_selectors[__element_selector_32];
  2084.   __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  2085.   __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  2086.   return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
  2087.                            (__vector unsigned char)__t);
  2088. }
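
/* Each 2-bit field of __mask selects one source word; the table supplies
   that word's byte indices, and the +0x10101010 bias points the upper
   two selectors at the second vec_perm operand (bytes 16..31), which
   here is the same vector.  For example (hypothetical values):

     __m128i v = _mm_set_epi32(3, 2, 1, 0);
     _mm_shuffle_epi32(v, 0x1B);   // == _mm_set_epi32(0, 1, 2, 3)
*/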
  2089.  
  2090. extern __inline void
  2091.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2092.     _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  2093.   __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  2094.   __v16qu __mask, __tmp;
  2095.   __m128i_u *__p = (__m128i_u *)__C;
  2096.  
  2097.   __tmp = (__v16qu)_mm_loadu_si128(__p);
  2098.   __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  2099.   __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  2100.   _mm_storeu_si128(__p, (__m128i)__tmp);
  2101. }
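
/* The unsigned compare against 0x7f is true exactly for the bytes of __B
   whose most significant bit is set, so vec_sel merges those bytes of __A
   into the existing memory contents.  Note this emulation is an ordinary
   load/modify/store, not the non-temporal masked store the x86
   instruction performs.  */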
  2102.  
  2103. extern __inline __m128i
  2104.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2105.     _mm_avg_epu8(__m128i __A, __m128i __B) {
  2106.   return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
  2107. }
  2108.  
  2109. extern __inline __m128i
  2110.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2111.     _mm_avg_epu16(__m128i __A, __m128i __B) {
  2112.   return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
  2113. }
  2114.  
  2115. extern __inline __m128i
  2116.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2117.     _mm_sad_epu8(__m128i __A, __m128i __B) {
  2118.   __v16qu __a, __b;
  2119.   __v16qu __vabsdiff;
  2120.   __v4si __vsum;
  2121.   const __v4su __zero = {0, 0, 0, 0};
  2122.   __v4si __result;
  2123.  
  2124.   __a = (__v16qu)__A;
  2125.   __b = (__v16qu)__B;
  2126. #ifndef _ARCH_PWR9
  2127.   __v16qu __vmin = vec_min(__a, __b);
  2128.   __v16qu __vmax = vec_max(__a, __b);
  2129.   __vabsdiff = vec_sub(__vmax, __vmin);
  2130. #else
  2131.   __vabsdiff = vec_absd(__a, __b);
  2132. #endif
  2133.   /* Sum four groups of bytes into integers.  */
  2134.   __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
  2135. #ifdef __LITTLE_ENDIAN__
  2136.   /* Sum across four integers with two integer results.  */
  2137.   __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  2138.   /* Note: vec_sum2s could be used here, but on little-endian it adds
  2139.      vector shifts that are not needed for this use case: they would
  2140.      move the 32-bit results (currently at elements [0] and [2]) to
  2141.      [1] and [3], only for them to be moved back again, since the
  2142.      desired results are two 64-bit integers ([1]|[0] and [3]|[2]).
  2143.      Thus, no shift is performed.  */
  2144. #else
  2145.   /* Sum across four integers with two integer results.  */
  2146.   __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  2147.   /* Rotate the sums into the correct position.  */
  2148.   __result = vec_sld(__result, __result, 6);
  2149. #endif
  2150.   return (__m128i)__result;
  2151. }
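
/* Sketch of the result, matching psadbw (a, b being the sixteen unsigned
   bytes of __A and __B):

     r[0] = |a[0]-b[0]| + ... + |a[7]-b[7]|;
     r[1] = |a[8]-b[8]| + ... + |a[15]-b[15]|;

   where r[0] and r[1] are the two 64-bit halves of the returned vector
   (their upper bits are zero).  */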
  2152.  
  2153. extern __inline void
  2154.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2155.     _mm_stream_si32(int *__A, int __B) {
  2156.   /* Use the data cache block touch for store transient.  */
  2157.   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  2158.   *__A = __B;
  2159. }
  2160.  
  2161. extern __inline void
  2162.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2163.     _mm_stream_si64(long long int *__A, long long int __B) {
  2164.   /* Use the data cache block touch for store transient.  */
  2165.   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  2166.   *__A = __B;
  2167. }
  2168.  
  2169. extern __inline void
  2170.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2171.     _mm_stream_si128(__m128i *__A, __m128i __B) {
  2172.   /* Use the data cache block touch for store transient.  */
  2173.   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  2174.   *__A = __B;
  2175. }
  2176.  
  2177. extern __inline void
  2178.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2179.     _mm_stream_pd(double *__A, __m128d __B) {
  2180.   /* Use the data cache block touch for store transient.  */
  2181.   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  2182.   *(__m128d *)__A = __B;
  2183. }
  2184.  
  2185. extern __inline void
  2186.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2187.     _mm_clflush(void const *__A) {
  2188.   /* Use the data cache block flush.  */
  2189.   __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
  2190. }
  2191.  
  2192. extern __inline void
  2193.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2194.     _mm_lfence(void) {
  2195.   /* Use light weight sync for load to load ordering.  */
  2196.   __atomic_thread_fence(__ATOMIC_RELEASE);
  2197. }
  2198.  
  2199. extern __inline void
  2200.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2201.     _mm_mfence(void) {
  2202.   /* Use heavy weight sync for any to any ordering.  */
  2203.   __atomic_thread_fence(__ATOMIC_SEQ_CST);
  2204. }
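
/* Both fences are expressed with the portable __atomic_thread_fence
   builtin rather than hard-coded sync instructions, letting the compiler
   emit the appropriate lwsync/hwsync.  A minimal publish sketch
   (hypothetical variables data and ready):

     data = 42;
     _mm_mfence();   // full barrier: data is visible before ready
     ready = 1;
*/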
  2205.  
  2206. extern __inline __m128i
  2207.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2208.     _mm_cvtsi32_si128(int __A) {
  2209.   return _mm_set_epi32(0, 0, 0, __A);
  2210. }
  2211.  
  2212. extern __inline __m128i
  2213.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2214.     _mm_cvtsi64_si128(long long __A) {
  2215.   return __extension__(__m128i)(__v2di){__A, 0LL};
  2216. }
  2217.  
  2218. /* Microsoft intrinsic.  */
  2219. extern __inline __m128i
  2220.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2221.     _mm_cvtsi64x_si128(long long __A) {
  2222.   return __extension__(__m128i)(__v2di){__A, 0LL};
  2223. }
  2224.  
  2225. /* Casts between various SP, DP, INT vector types.  Note that these do no
  2226.    conversion of values, they just change the type.  */
  2227. extern __inline __m128
  2228.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2229.     _mm_castpd_ps(__m128d __A) {
  2230.   return (__m128)__A;
  2231. }
  2232.  
  2233. extern __inline __m128i
  2234.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2235.     _mm_castpd_si128(__m128d __A) {
  2236.   return (__m128i)__A;
  2237. }
  2238.  
  2239. extern __inline __m128d
  2240.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2241.     _mm_castps_pd(__m128 __A) {
  2242.   return (__m128d)__A;
  2243. }
  2244.  
  2245. extern __inline __m128i
  2246.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2247.     _mm_castps_si128(__m128 __A) {
  2248.   return (__m128i)__A;
  2249. }
  2250.  
  2251. extern __inline __m128
  2252.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2253.     _mm_castsi128_ps(__m128i __A) {
  2254.   return (__m128)__A;
  2255. }
  2256.  
  2257. extern __inline __m128d
  2258.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  2259.     _mm_castsi128_pd(__m128i __A) {
  2260.   return (__m128d)__A;
  2261. }
  2262.  
  2263. #else
  2264. #include_next <emmintrin.h>
  2265. #endif /* defined(__powerpc64__) &&                                            \
  2266.         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
  2267.  
  2268. #endif /* EMMINTRIN_H_ */
  2269.