/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
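
/* Usage sketch (illustrative): the check above is satisfied either on the
   command line or before the first include of this header, for example

       cc -DNO_WARN_X86_INTRINSICS -c ported_code.c

   or, in the source itself,

       #define NO_WARN_X86_INTRINSICS 1
       #include <smmintrin.h>

   The file name above is a placeholder.  */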

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}
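
/* Usage sketch (illustrative): rounding both lanes toward negative infinity
   while suppressing floating-point exceptions; the variable names are
   placeholders, and _mm_set_pd comes from the companion emmintrin.h wrapper.

       __m128d __vals = _mm_set_pd(2.5, -1.5);
       __m128d __down =
           _mm_round_pd(__vals, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
       // -1.5 rounds to -2.0 and 2.5 rounds to 2.0.
       // _mm_floor_pd(__vals), defined below, expands to the _MM_FROUND_FLOOR
       // form, which does not suppress exceptions.
 */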

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
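
/* Usage note (illustrative): as with the x86 instruction, _mm_extract_ps
   returns the raw 32-bit pattern of the selected element rather than its
   numeric value.  A sketch of recovering the float (placeholder names):

       int __bits = _mm_extract_ps(__x, 2);
       float __f;
       __builtin_memcpy(&__f, &__bits, sizeof(__f));
 */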

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
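
/* Usage sketch (illustrative): each result byte is taken from __B when the
   most significant bit of the corresponding mask byte is set, and from __A
   otherwise.  For example, a per-byte maximum (placeholder variable names;
   _mm_cmpgt_epi8 comes from the emmintrin.h wrapper):

       __m128i __gt = _mm_cmpgt_epi8(__b, __a);          // 0xFF where b > a
       __m128i __max = _mm_blendv_epi8(__a, __b, __gt);
 */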

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
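
/* Usage sketch (illustrative): unlike the x86 PTEST instruction, these
   helpers return plain 0/1 values and set no condition flags (see the notes
   above).  Placeholder variable names:

       int __none_set = _mm_test_all_zeros(__mask, __v); // 1 if (__mask & __v) == 0
       int __all_set = _mm_test_all_ones(__v);           // 1 if all 128 bits of __v are set
 */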
  423.  
  424. #ifdef _ARCH_PWR8
  425. extern __inline __m128i
  426.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  427.     _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  428.   return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
  429. }
  430. #endif
  431.  
  432. extern __inline __m128i
  433.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  434.     _mm_min_epi8(__m128i __X, __m128i __Y) {
  435.   return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
  436. }
  437.  
  438. extern __inline __m128i
  439.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  440.     _mm_min_epu16(__m128i __X, __m128i __Y) {
  441.   return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
  442. }
  443.  
  444. extern __inline __m128i
  445.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  446.     _mm_min_epi32(__m128i __X, __m128i __Y) {
  447.   return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
  448. }
  449.  
  450. extern __inline __m128i
  451.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  452.     _mm_min_epu32(__m128i __X, __m128i __Y) {
  453.   return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
  454. }
  455.  
  456. extern __inline __m128i
  457.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  458.     _mm_max_epi8(__m128i __X, __m128i __Y) {
  459.   return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
  460. }
  461.  
  462. extern __inline __m128i
  463.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  464.     _mm_max_epu16(__m128i __X, __m128i __Y) {
  465.   return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
  466. }
  467.  
  468. extern __inline __m128i
  469.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  470.     _mm_max_epi32(__m128i __X, __m128i __Y) {
  471.   return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
  472. }
  473.  
  474. extern __inline __m128i
  475.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  476.     _mm_max_epu32(__m128i __X, __m128i __Y) {
  477.   return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
  478. }
  479.  
  480. extern __inline __m128i
  481.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  482.     _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  483.   return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
  484. }
  485.  
  486. #ifdef _ARCH_PWR8
  487. extern __inline __m128i
  488.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  489.     _mm_mul_epi32(__m128i __X, __m128i __Y) {
  490.   return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
  491. }
  492. #endif
  493.  
  494. extern __inline __m128i
  495.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  496.     _mm_cvtepi8_epi16(__m128i __A) {
  497.   return (__m128i)vec_unpackh((__v16qi)__A);
  498. }
  499.  
  500. extern __inline __m128i
  501.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  502.     _mm_cvtepi8_epi32(__m128i __A) {
  503.   __A = (__m128i)vec_unpackh((__v16qi)__A);
  504.   return (__m128i)vec_unpackh((__v8hi)__A);
  505. }
  506.  
  507. #ifdef _ARCH_PWR8
  508. extern __inline __m128i
  509.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  510.     _mm_cvtepi8_epi64(__m128i __A) {
  511.   __A = (__m128i)vec_unpackh((__v16qi)__A);
  512.   __A = (__m128i)vec_unpackh((__v8hi)__A);
  513.   return (__m128i)vec_unpackh((__v4si)__A);
  514. }
  515. #endif
  516.  
  517. extern __inline __m128i
  518.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  519.     _mm_cvtepi16_epi32(__m128i __A) {
  520.   return (__m128i)vec_unpackh((__v8hi)__A);
  521. }
  522.  
  523. #ifdef _ARCH_PWR8
  524. extern __inline __m128i
  525.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  526.     _mm_cvtepi16_epi64(__m128i __A) {
  527.   __A = (__m128i)vec_unpackh((__v8hi)__A);
  528.   return (__m128i)vec_unpackh((__v4si)__A);
  529. }
  530. #endif
  531.  
  532. #ifdef _ARCH_PWR8
  533. extern __inline __m128i
  534.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  535.     _mm_cvtepi32_epi64(__m128i __A) {
  536.   return (__m128i)vec_unpackh((__v4si)__A);
  537. }
  538. #endif
  539.  
  540. extern __inline __m128i
  541.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  542.     _mm_cvtepu8_epi16(__m128i __A) {
  543.   const __v16qu __zero = {0};
  544. #ifdef __LITTLE_ENDIAN__
  545.   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  546. #else  /* __BIG_ENDIAN__.  */
  547.   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  548. #endif /* __BIG_ENDIAN__.  */
  549.   return __A;
  550. }
  551.  
  552. extern __inline __m128i
  553.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  554.     _mm_cvtepu8_epi32(__m128i __A) {
  555.   const __v16qu __zero = {0};
  556. #ifdef __LITTLE_ENDIAN__
  557.   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  558.   __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  559. #else  /* __BIG_ENDIAN__.  */
  560.   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  561.   __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  562. #endif /* __BIG_ENDIAN__.  */
  563.   return __A;
  564. }
  565.  
  566. extern __inline __m128i
  567.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  568.     _mm_cvtepu8_epi64(__m128i __A) {
  569.   const __v16qu __zero = {0};
  570. #ifdef __LITTLE_ENDIAN__
  571.   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  572.   __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  573.   __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
  574. #else  /* __BIG_ENDIAN__.  */
  575.   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  576.   __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  577.   __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
  578. #endif /* __BIG_ENDIAN__.  */
  579.   return __A;
  580. }
  581.  
  582. extern __inline __m128i
  583.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  584.     _mm_cvtepu16_epi32(__m128i __A) {
  585.   const __v8hu __zero = {0};
  586. #ifdef __LITTLE_ENDIAN__
  587.   __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  588. #else  /* __BIG_ENDIAN__.  */
  589.   __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  590. #endif /* __BIG_ENDIAN__.  */
  591.   return __A;
  592. }
  593.  
  594. extern __inline __m128i
  595.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  596.     _mm_cvtepu16_epi64(__m128i __A) {
  597.   const __v8hu __zero = {0};
  598. #ifdef __LITTLE_ENDIAN__
  599.   __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  600.   __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
  601. #else  /* __BIG_ENDIAN__.  */
  602.   __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  603.   __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
  604. #endif /* __BIG_ENDIAN__.  */
  605.   return __A;
  606. }
  607.  
  608. extern __inline __m128i
  609.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  610.     _mm_cvtepu32_epi64(__m128i __A) {
  611.   const __v4su __zero = {0};
  612. #ifdef __LITTLE_ENDIAN__
  613.   __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
  614. #else  /* __BIG_ENDIAN__.  */
  615.   __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
  616. #endif /* __BIG_ENDIAN__.  */
  617.   return __A;
  618. }
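
/* Usage note (illustrative): following the x86 semantics, the _mm_cvtepi*
   conversions above sign-extend and the _mm_cvtepu* conversions zero-extend,
   widening the low elements of the source.  For a source __v whose low byte
   is 0x80 (placeholder name):

       _mm_cvtepi8_epi16(__v);   // low halfword becomes 0xFF80 (-128)
       _mm_cvtepu8_epi16(__v);   // low halfword becomes 0x0080 (128)
 */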

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
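
/* Usage sketch (illustrative): the minimum lands in halfword 0 and its index
   in halfword 1, so both can be read back with the SSE2 extract intrinsic
   from the emmintrin.h wrapper (placeholder variable names):

       __m128i __mp = _mm_minpos_epu16(__v);
       int __min = _mm_extract_epi16(__mp, 0);
       int __idx = _mm_extract_epi16(__mp, 1);
 */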

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */