/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
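
// Note (explanatory, derived from the macro body below): each generated
// wrapper packs its width and lane-clamp mask into the control operand that
// the NVVM shuffle builtins expect, i.e. ((warpSize - __width) << 8) | __Mask.
// For example, with __width == 16 and __Mask == 0x1f this evaluates to
// ((32 - 16) << 8) | 0x1f == 0x101f: the high byte (warpSize - __width)
// confines the shuffle to __width-wide sub-sections of the warp, while the
// low bits clamp the source lane index.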
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")
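
// Usage sketch (illustrative only): a warp-wide sum reduction built on the
// pre-CUDA-9 __shfl_down wrapper above; it assumes the whole warp is
// converged. On CUDA 9 and later, prefer the *_sync variants defined below.
//
//   __device__ int warp_sum(int val) {
//     for (int delta = warpSize / 2; delta > 0; delta /= 2)
//       val += __shfl_down(val, delta);
//     return val; // lane 0 ends up with the sum over the warp
//   }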

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
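
// Usage sketch (illustrative only): a butterfly all-reduce built on the
// __shfl_xor_sync wrapper above; assumes all 32 lanes participate, hence the
// full 0xffffffff member mask.
//
//   __device__ float warp_all_reduce_sum(float val) {
//     for (int lane_mask = warpSize / 2; lane_mask > 0; lane_mask /= 2)
//       val += __shfl_xor_sync(0xffffffffu, val, lane_mask);
//     return val; // every lane now holds the warp-wide sum
//   }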

inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}
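
// Usage sketch (illustrative only): __syncwarp as a warp-level barrier for a
// shared-memory exchange; assumes a single-warp block (blockDim.x == 32).
//
//   __global__ void rotate_within_warp(int *out, const int *in) {
//     __shared__ int tile[32];
//     unsigned lane = threadIdx.x % 32u;
//     tile[lane] = in[threadIdx.x];
//     __syncwarp();                       // make the writes warp-visible
//     out[threadIdx.x] = tile[(lane + 1) % 32u];
//   }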

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}
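
// Usage sketch (illustrative only): counting how many lanes satisfy a
// predicate by combining __ballot_sync with the __popc builtin (declared
// elsewhere in the CUDA headers); assumes every lane named in `mask` reaches
// the call.
//
//   __device__ int warp_count(unsigned int mask, int pred) {
//     return __popc(__ballot_sync(mask, pred));
//   }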

inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  unsigned int mask;
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
#endif
}

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
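
// Usage sketch (illustrative only): __match32_any_sync returns the set of
// lanes whose `key` equals the caller's; assumes a fully converged warp,
// hence the full member mask.
//
//   __device__ unsigned int lanes_with_same_key(unsigned int key) {
//     return __match32_any_sync(0xffffffffu, key);
//   }
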
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

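// Usage sketch (illustrative only): __ldg reads through the read-only data
// cache, so the pointed-to memory must not be written by any thread while the
// kernel runs.
//
//   __global__ void scale(float *out, const float *in, float k, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n)
//       out[i] = k * __ldg(&in[i]);
//   }
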
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment. This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}
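
// Usage sketch (illustrative only): a 64-bit rotate-left composed from the
// funnel shifts above, which return the high word of the shifted 64-bit
// concatenation (high32:low32).
//
//   __device__ unsigned long long rotl64(unsigned long long x, unsigned n) {
//     unsigned lo = (unsigned)x, hi = (unsigned)(x >> 32);
//     n &= 63;
//     if (n >= 32) { unsigned t = lo; lo = hi; hi = t; n -= 32; }
//     unsigned new_hi = __funnelshift_l(lo, hi, n); // high word of (hi:lo) << n
//     unsigned new_lo = __funnelshift_l(hi, lo, n); // high word of (lo:hi) << n
//     return ((unsigned long long)new_hi << 32) | new_lo;
//   }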

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
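
// Usage sketch (illustrative only): __nvvm_get_smem_pointer yields the 32-bit
// shared-state-space address of a generic pointer into __shared__ memory,
// e.g. for handing a buffer address to inline PTX.
//
//   __global__ void smem_address_demo(unsigned int *out) {
//     __shared__ int buf[32];
//     unsigned int addr = __nvvm_get_smem_pointer(buf);
//     if (threadIdx.x == 0)
//       *out = addr;
//   }
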
#endif // CUDA_VERSION >= 11000

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)