/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined X86GPRINTRIN_H_
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef BMI2INTRIN_H_
#define BMI2INTRIN_H_

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  return ((__X << (32 - __Y)) >> (32 - __Y));
}
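
/* For example, _bzhi_u32(0x12345678, 16) clears everything above bit
   15 and yields 0x5678.  Note that __Y == 0 makes the shift count 32,
   which is undefined behavior in C, whereas the hardware bzhi
   instruction returns 0 in that case.  */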

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  unsigned long long __res = (unsigned long long)__X * __Y;
  *__P = (unsigned int)(__res >> 32);
  return (unsigned int)__res;
}
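
/* For example, _mulx_u32(0x80000000, 4, &__hi) forms the full 64-bit
   product 0x200000000, returning the low half (0) and storing the high
   half (2) through __hi.  */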
  30.  
  31. #ifdef __PPC64__
  32. extern __inline unsigned long long
  33.     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  34.     _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  35.   return ((__X << (64 - __Y)) >> (64 - __Y));
  36. }

/* __int128 requires a 64-bit base architecture.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  unsigned __int128 __res = (unsigned __int128)__X * __Y;
  *__P = (unsigned long long)(__res >> 64);
  return (unsigned long long)__res;
}

#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
     source to process.  It is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl(__M);

  /* The loop runs once for each '1' bit in the mask, clearing
     each mask bit as it is processed.  */
  while (__m != 0) {
    __c = __builtin_clzl(__m);
    __t = __X << (__p - __c);
    __m ^= (__mask >> __c);
    __result |= (__t & (__mask >> __c));
    __p++;
  }
  return __result;
}
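
/* For example, _pdep_u64(0x5, 0xF0) deposits the low-order source bits
   into the mask's set positions, least significant first: source bits
   1,0,1,0 land in result positions 4..7, yielding 0x50.  */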

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or fewer, we can use
     the Power8 bit permute (bpermd) instruction.  */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* If the pext mask is constant, then the popcount is also
       constant, so we can evaluate the following loop at compile
       time and use a constant bit permute vector.  */
    long __i;
    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
      __c = __builtin_clzl(__m);
      __p = (__p << 8) | __c;
      __m ^= (__mask >> __c);
    }
    __result = __builtin_bpermd(__p, __X);
  } else {
    __p = 64 - __builtin_popcountl(__M);
    __result = 0;
    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while
       loop avoids unrolling, and the compiler commons the xor
       that clears the mask bit with the (__m != 0) test.  The
       result is a more compact loop setup and body.  */
    while (__m != 0) {
      unsigned long __t;
      __c = __builtin_clzl(__m);
      __t = (__X & (__mask >> __c)) >> (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t);
      __p++;
    }
  }
  return __result;
}
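
/* For example, _pext_u64(0x50, 0xF0) is the inverse of the pdep example
   above: the source bits selected by the mask (positions 4..7) are
   packed into the low-order bits of the result, yielding 0x5.  */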

/* These 32-bit implementations depend on the 64-bit pdep/pext
   above, which require _ARCH_PWR7.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  return _pdep_u64(__X, __Y);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  return _pext_u64(__X, __Y);
}
#endif /* _ARCH_PWR7  */
#endif /* __PPC64__  */

#endif /* BMI2INTRIN_H_ */
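
/* Usage sketch (illustrative; __a, __b, and __bits stand in for caller
   values): with these wrappers on the include path, 64-bit Power code
   built for power7 or later can use the x86 spellings directly:

     #include <x86gprintrin.h>

     unsigned long long __hi;
     unsigned long long __lo  = _mulx_u64(__a, __b, &__hi);
     unsigned long long __sel = _pext_u64(__bits, 0xFF00UL);
 */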