Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 14 | pmbaty | 1 | /*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------=== |
| 2 | * |
||
| 3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
| 4 | * See https://llvm.org/LICENSE.txt for license information. |
||
| 5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
| 6 | * |
||
| 7 | *===-----------------------------------------------------------------------=== |
||
| 8 | */ |
||
| 9 | |||
| 10 | #if !defined X86GPRINTRIN_H_ |
||
| 11 | #error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." |
||
| 12 | #endif |
||
| 13 | |||
| 14 | #ifndef BMI2INTRIN_H_ |
||
| 15 | #define BMI2INTRIN_H_ |
||
| 16 | |||
/* Zero the bits of __X at position __Y and above (BMI2 BZHI).
   Per the x86 specification, only the low 8 bits of the index are
   used, and an index >= the operand width returns __X unchanged.
   The previous double-shift form ((__X << (32 - __Y)) >> (32 - __Y))
   invoked undefined behavior for __Y == 0 and __Y >= 32 (shift count
   equal to or exceeding the type width).  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u32(unsigned int __X, unsigned int __Y) {
  unsigned int __idx = __Y & 0xFFu; /* BZHI consumes index bits 7:0 only */
  if (__idx >= 32)
    return __X;
  /* 1ULL avoids shift-by-width UB at __idx == 0..31; mask fits 32 bits. */
  return __X & (unsigned int)((1ULL << __idx) - 1);
}
||
| 22 | |||
/* Unsigned 32 x 32 -> 64-bit multiply (BMI2 MULX).  The high half of
   the product is stored through __P; the low half is returned.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  /* Widen both operands before multiplying so the full 64-bit product
     is computed, then split it into its two 32-bit halves.  */
  const unsigned long long __prod =
      (unsigned long long)__X * (unsigned long long)__Y;
  *__P = (unsigned int)(__prod >> 32);
  return (unsigned int)(__prod & 0xFFFFFFFFULL);
}
||
| 30 | |||
| 31 | #ifdef __PPC64__ |
||
/* Zero the bits of __X at position __Y and above (BMI2 BZHI, 64-bit).
   Per the x86 specification, only the low 8 bits of the index are
   used, and an index >= 64 returns __X unchanged.  The previous
   double-shift form ((__X << (64 - __Y)) >> (64 - __Y)) invoked
   undefined behavior for __Y == 0 and __Y >= 64 (shift count equal to
   or exceeding the type width).  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  unsigned long long __idx = __Y & 0xFFULL; /* index bits 7:0 only */
  if (__idx >= 64)
    return __X;
  /* __idx is in [0, 63] here, so the shift is well defined; __idx == 0
     yields an all-zero mask, matching BZHI.  */
  return __X & ((1ULL << __idx) - 1);
}
||
| 37 | |||
| 38 | /* __int128 requires base 64-bit. */ |
||
/* Unsigned 64 x 64 -> 128-bit multiply (BMI2 MULX).  The high half of
   the product is stored through __P; the low half is returned.
   Requires the 64-bit base ABI for the __int128 type.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mulx_u64(unsigned long long __X, unsigned long long __Y,
              unsigned long long *__P) {
  /* Compute the full product in 128 bits, then split it.  */
  const unsigned __int128 __prod = (unsigned __int128)__X * __Y;
  *__P = (unsigned long long)(__prod >> 64);
  return (unsigned long long)__prod;
}
||
| 47 | |||
| 48 | #ifdef _ARCH_PWR7 |
||
| 49 | /* popcount and bpermd require power7 minimum. */ |
||
/* Software emulation of the BMI2 PDEP (parallel bit deposit)
   instruction: scatter the low-order bits of __X into the bit
   positions selected by the set bits of mask __M; all other result
   bits are zero.
   NOTE(review): uses 'unsigned long' for 64-bit work, which assumes an
   LP64 target (true for PPC64) -- confirm if ported elsewhere.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of the bits from
     source to process.  This is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl(__M);

  /* The loop is for the number of '1' bits in the mask and clearing
     each mask bit as it is processed.  */
  while (__m != 0) {
    /* __c is the distance of the highest remaining mask bit from the
       MSB; __p - __c is how far the matching source bit (the highest
       not-yet-deposited low bit of __X) must be shifted left to land
       on it.  Incrementing __p each pass walks down through the
       source bits.  */
    __c = __builtin_clzl(__m);
    __t = __X << (__p - __c);
    __m ^= (__mask >> __c); /* clear the mask bit just handled */
    __result |= (__t & (__mask >> __c));
    __p++;
  }
  return __result;
}
||
| 75 | |||
/* Software emulation of the BMI2 PEXT (parallel bit extract)
   instruction: gather the bits of __X selected by the set bits of
   mask __M and pack them into the low-order bits of the result.
   NOTE(review): uses 'unsigned long' for 64-bit work, which assumes an
   LP64 target (true for PPC64) -- confirm if ported elsewhere.  */
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u64(unsigned long long __X, unsigned long long __M) {
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* if the mask is constant and selects 8 bits or less we can use
     the Power8 Bit permute instruction.  */
  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* Also if the pext mask is constant, then the popcount is
       constant, we can evaluate the following loop at compile
       time and use a constant bit permute vector.  */
    long __i;
    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
      /* Append each selected bit's index (from the MSB) to the
         permute control word, one byte per selected bit.  */
      __c = __builtin_clzl(__m);
      __p = (__p << 8) | __c;
      __m ^= (__mask >> __c);
    }
    __result = __builtin_bpermd(__p, __X);
  } else {
    __p = 64 - __builtin_popcountl(__M);
    __result = 0;
    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while
       loop avoids unrolling and the compiler commons the xor
       from clearing the mask bit with the (m != 0) test.  The
       result is a more compact loop setup and body.  */
    while (__m != 0) {
      unsigned long __t;
      /* Isolate the highest remaining selected source bit and shift
         it right into its packed destination position.  */
      __c = __builtin_clzl(__m);
      __t = (__X & (__mask >> __c)) >> (__p - __c);
      __m ^= (__mask >> __c); /* clear the mask bit just handled */
      __result |= (__t);
      __p++;
    }
  }
  return __result;
}
||
| 117 | |||
| 118 | /* these 32-bit implementations depend on 64-bit pdep/pext |
||
| 119 | which depend on _ARCH_PWR7. */ |
||
/* 32-bit PDEP, implemented by delegating to the 64-bit emulation:
   zero-extended operands produce the same low 32 result bits.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  return (unsigned int)_pdep_u64(__X, __Y);
}
||
| 125 | |||
/* 32-bit PEXT, implemented by delegating to the 64-bit emulation:
   zero-extended operands produce the same low 32 result bits.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  return (unsigned int)_pext_u64(__X, __Y);
}
||
| 131 | #endif /* _ARCH_PWR7 */ |
||
| 132 | #endif /* __PPC64__ */ |
||
| 133 | |||
| 134 | #endif /* BMI2INTRIN_H_ */ |