Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | /*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------=== |
2 | * |
||
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | * See https://llvm.org/LICENSE.txt for license information. |
||
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | * |
||
7 | *===-----------------------------------------------------------------------=== |
||
8 | */ |
||
9 | |||
10 | #if !defined X86GPRINTRIN_H_ |
||
11 | #error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead." |
||
12 | #endif |
||
13 | |||
14 | #ifndef BMI2INTRIN_H_ |
||
15 | #define BMI2INTRIN_H_ |
||
16 | |||
/* Zero the bits of __X from bit position __Y upwards (emulates x86 BZHI).

   __X  source value.
   __Y  index of the first bit to clear; as with the x86 instruction only
        bits 7:0 of the index are significant.

   Returns __X with bits [31:index] cleared.  An index of 0 yields 0 and
   an index of 32 or more returns __X unchanged.

   A mask is used instead of the shift pair "(x << (32 - y)) >> (32 - y)"
   because that form shifts by the full type width when the index is 0
   and by a wrapped-around count when the index exceeds 32, both of which
   are undefined behavior in C.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32(unsigned int __X, unsigned int __Y) {
  const unsigned int __n = __Y & 0xFFU;

  if (__n >= 32)
    return __X;
  /* __n is in 0..31 here, so the shift count is always valid. */
  return __X & ((1U << __n) - 1U);
}
||
22 | |||
/* Unsigned 32 x 32 -> 64 bit multiply.

   __X, __Y  the two factors.
   __P       receives the upper 32 bits of the 64-bit product.

   Returns the lower 32 bits of the product.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
  /* Widen before multiplying so the full 64-bit product is kept. */
  const unsigned long long __wide =
      (unsigned long long)__X * (unsigned long long)__Y;
  const unsigned int __hi = (unsigned int)(__wide >> 32);
  const unsigned int __lo = (unsigned int)(__wide & 0xFFFFFFFFULL);

  *__P = __hi;
  return __lo;
}
||
30 | |||
31 | #ifdef __PPC64__ |
||
/* Zero the bits of __X from bit position __Y upwards (emulates x86 BZHI).

   __X  source value.
   __Y  index of the first bit to clear; as with the x86 instruction only
        bits 7:0 of the index are significant.

   Returns __X with bits [63:index] cleared.  An index of 0 yields 0 and
   an index of 64 or more returns __X unchanged.

   A mask is used instead of the shift pair "(x << (64 - y)) >> (64 - y)"
   because that form shifts by the full type width when the index is 0
   and by a wrapped-around count when the index exceeds 64, both of which
   are undefined behavior in C.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
  const unsigned int __n = (unsigned int)(__Y & 0xFFULL);

  if (__n >= 64)
    return __X;
  /* __n is in 0..63 here, so the shift count is always valid. */
  return __X & ((1ULL << __n) - 1ULL);
}
||
37 | |||
/* Unsigned 64 x 64 -> 128 bit multiply.  Uses unsigned __int128, which
   requires a 64-bit base target.

   __X, __Y  the two factors.
   __P       receives the upper 64 bits of the 128-bit product.

   Returns the lower 64 bits of the product.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64(unsigned long long __X, unsigned long long __Y,
          unsigned long long *__P) {
  /* Widen before multiplying so the full 128-bit product is kept. */
  const unsigned __int128 __wide =
      (unsigned __int128)__X * (unsigned __int128)__Y;
  const unsigned long long __hi = (unsigned long long)(__wide >> 64);
  const unsigned long long __lo = (unsigned long long)__wide;

  *__P = __hi;
  return __lo;
}
||
47 | |||
48 | #ifdef _ARCH_PWR7 |
||
/* popcount and bpermd require power7 minimum. */

/* Parallel bit deposit: scatter the low-order bits of __X into the bit
   positions that are set in the mask __M.

   __X  source; its low popcount(__M) bits are consumed, lowest source
        bit going to the lowest set mask bit.
   __M  mask selecting the destination bit positions.

   Returns the deposited value; every bit outside __M is zero.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __msb = 0x8000000000000000UL;
  unsigned long __pending = __M;    /* mask bits still to be handled */
  unsigned long __deposited = 0x0UL;
  /* 64 minus the number of mask bits.  Together with the leading-zero
     count of the remaining mask this gives the left shift that lines the
     next source bit up with its destination.  */
  unsigned long __shift_base = 64 - __builtin_popcountl(__M);

  /* One iteration per '1' bit in the mask, working from the most
     significant mask bit downwards and clearing each bit once used.  */
  for (; __pending != 0; __shift_base++) {
    const unsigned long __lead = __builtin_clzl(__pending);
    const unsigned long __bit = __msb >> __lead;

    __deposited |= (__X << (__shift_base - __lead)) & __bit;
    __pending ^= __bit;
  }
  return __deposited;
}
||
75 | |||
/* Parallel bit extract: gather the bits of __X found at the positions set
   in the mask __M and pack them into the low-order bits of the result.

   __X  source value.
   __M  mask selecting which source bits to extract.

   Returns the selected bits packed contiguously from bit 0 upwards, the
   lowest selected source bit landing in bit 0.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64(unsigned long long __X, unsigned long long __M) {
  const unsigned long __msb = 0x8000000000000000UL;
  unsigned long __pending = __M;    /* mask bits still to be handled */
  unsigned long __lead;
  unsigned long __extracted;

  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
    /* A constant mask selecting 8 bits or fewer can be handled by the
       bit permute instruction.  Because the mask is constant so is its
       popcount, which lets the loop below be evaluated at compile time
       and turns the control word into a constant bit permute vector.  */
    /* Initial bit permute control: every byte starts as 0x40.
       NOTE(review): presumably an index that selects no source bit;
       confirm against the bpermd description in the Power ISA.  */
    unsigned long __ctl = 0x4040404040404040UL;
    const long __nbits = __builtin_popcountl(__M);
    long __k;

    for (__k = 0; __k < __nbits; __k++) {
      __lead = __builtin_clzl(__pending);
      /* Append the index of the next mask bit as the new low byte. */
      __ctl = (__ctl << 8) | __lead;
      __pending ^= (__msb >> __lead);
    }
    __extracted = __builtin_bpermd(__ctl, __X);
  } else {
    /* 64 minus the number of mask bits.  Together with the leading-zero
       count of the remaining mask this gives the right shift that moves
       the next selected bit to its packed position.  */
    unsigned long __shift_base = 64 - __builtin_popcountl(__M);

    __extracted = 0;
    /* We could use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while loop
       avoids unrolling, and the compiler commons the xor that clears
       the mask bit with the (mask != 0) test.  The result is a more
       compact loop setup and body.  */
    while (__pending != 0) {
      unsigned long __bit;

      __lead = __builtin_clzl(__pending);
      __bit = __msb >> __lead;
      __extracted |= (__X & __bit) >> (__shift_base - __lead);
      __pending ^= __bit;
      __shift_base++;
    }
  }
  return __extracted;
}
||
117 | |||
/* The 32-bit pdep/pext implementations are built on the 64-bit versions,
   which in turn depend on _ARCH_PWR7.  */

/* 32-bit parallel bit deposit.

   __X  source whose low-order bits are deposited.
   __Y  mask selecting the destination bit positions.

   Both operands are widened to 64 bits, passed to _pdep_u64, and the
   result is narrowed back to 32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide = _pdep_u64(__X, __Y);

  return (unsigned int)__wide;
}
||
125 | |||
/* 32-bit parallel bit extract.

   __X  source value.
   __Y  mask selecting which source bits to extract.

   Both operands are widened to 64 bits, passed to _pext_u64, and the
   result is narrowed back to 32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide = _pext_u64(__X, __Y);

  return (unsigned int)__wide;
}
||
131 | #endif /* _ARCH_PWR7 */ |
||
132 | #endif /* __PPC64__ */ |
||
133 | |||
134 | #endif /* BMI2INTRIN_H_ */ |