/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC -----===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined X86GPRINTRIN_H_
#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef BMI2INTRIN_H_
#define BMI2INTRIN_H_

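/* _bzhi_u32: clear the bits of __X at and above bit position __Y, i.e. keep
   only the low __Y bits (the operation performed by the x86 BZHI
   instruction; this shift-based emulation assumes 0 < __Y <= 32).  */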
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  return ((__X << (32 - __Y)) >> (32 - __Y));
}

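/* _mulx_u32: 32 x 32 -> 64-bit unsigned multiply (as performed by x86 MULX);
   the high 32 bits of the product are stored through __P and the low 32 bits
   are returned.  */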
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  unsigned long long __res = (unsigned long long)__X * __Y;
  *__P = (unsigned int)(__res >> 32);
  return (unsigned int)__res;
}

#ifdef __PPC64__
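/* _bzhi_u64: 64-bit variant of _bzhi_u32; keeps only the low __Y bits of __X
   (assumes 0 < __Y <= 64).  */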
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  return ((__X << (64 - __Y)) >> (64 - __Y));
}

/* __int128 requires base 64-bit.  */
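/* _mulx_u64: 64 x 64 -> 128-bit unsigned multiply; the high 64 bits of the
   product are stored through __P and the low 64 bits are returned.  */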
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  unsigned __int128 __res = (unsigned __int128)__X * __Y;
  *__P = (unsigned long long)(__res >> 64);
  return (unsigned long long)__res;
}

#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum.  */
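/* _pdep_u64: parallel bit deposit (x86 PDEP); scatter the low-order bits of
   __X into the result at the positions of the '1' bits of the mask __M,
   working from the least significant mask bit upward.  */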
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
     source to process.  This is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl(__M);

  /* The loop runs once for each '1' bit in the mask, clearing
     each mask bit as it is processed.  */
  while (__m != 0) {
    __c = __builtin_clzl(__m);
    __t = __X << (__p - __c);
    __m ^= (__mask >> __c);
    __result |= (__t & (__mask >> __c));
    __p++;
  }
  return __result;
}

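/* _pext_u64: parallel bit extract (x86 PEXT); gather the bits of __X selected
   by the '1' bits of the mask __M and pack them into the low-order bits of
   the result.  */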
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or less, we can use
     the Power8 bit-permute (bpermd) instruction.  */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* If the pext mask is constant, then the popcount is also
       constant, so we can evaluate the following loop at compile
       time and use a constant bit permute vector.  */
    long __i;
    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
      __c = __builtin_clzl(__m);
      __p = (__p << 8) | __c;
      __m ^= (__mask >> __c);
    }
    __result = __builtin_bpermd(__p, __X);
  } else {
    __p = 64 - __builtin_popcountl(__M);
    __result = 0;
    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while
       loop avoids unrolling, and the compiler commons the xor
       from clearing the mask bit with the (m != 0) test.  The
       result is a more compact loop setup and body.  */
    while (__m != 0) {
      unsigned long __t;
      __c = __builtin_clzl(__m);
      __t = (__X & (__mask >> __c)) >> (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t);
      __p++;
    }
  }
  return __result;
}

/* These 32-bit implementations depend on 64-bit pdep/pext,
   which depend on _ARCH_PWR7.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  return _pdep_u64(__X, __Y);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  return _pext_u64(__X, __Y);
}
#endif /* _ARCH_PWR7  */
#endif /* __PPC64__  */

#endif /* BMI2INTRIN_H_ */
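
For reference, a minimal, hypothetical usage sketch (not part of the header) showing the semantics these wrappers emulate. It assumes a 64-bit POWER7-or-later target on which `<x86gprintrin.h>` resolves to these PowerPC wrapper headers; the exact compile flags required (for example, whether `NO_WARN_X86_INTRINSICS` must be defined) depend on the toolchain.

```c
/* Hypothetical example: expected results of the emulated BMI2 operations.  */
#include <assert.h>
#include <x86gprintrin.h> /* assumed to map to the PowerPC wrappers */

int main(void) {
  unsigned int hi32;       /* high half of the 32-bit MULX product */
  unsigned long long hi64; /* high half of the 64-bit MULX product */

  /* BZHI: keep only the low 8 bits of the source.  */
  assert(_bzhi_u32(0xFFFFFFFFu, 8) == 0xFFu);

  /* MULX: full-width unsigned multiply; the high half comes back
     through the pointer argument.  */
  assert(_mulx_u32(0x10000u, 0x10000u, &hi32) == 0u && hi32 == 1u);
  assert(_mulx_u64(1ULL << 32, 1ULL << 32, &hi64) == 0ULL && hi64 == 1ULL);

  /* PDEP scatters the low-order source bits to the mask's '1' positions;
     PEXT gathers them back down to the low-order end.  */
  assert(_pdep_u64(0x5ULL, 0xF0F0ULL) == 0x50ULL);
  assert(_pext_u64(0x50ULL, 0xF0F0ULL) == 0x5ULL);
  return 0;
}
```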