/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")
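
// Usage sketch (editorial illustration, not part of the upstream header): a
// warp-wide sum reduction built on the __shfl_down wrapper defined above.
// Assumes all 32 lanes of the warp are active; warp_reduce_sum is a
// hypothetical helper name.
//
//   __device__ int warp_reduce_sum(int v) {
//     for (int delta = warpSize / 2; delta > 0; delta /= 2)
//       v += __shfl_down(v, delta);
//     return v; // lane 0 now holds the sum across the warp
//   }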

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
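
// Usage sketch (editorial illustration, not part of the upstream header): the
// same reduction expressed with the explicit-mask CUDA-9 variants. The
// 0xffffffff mask assumes every lane of the warp participates;
// warp_reduce_sum_sync is a hypothetical helper name.
//
//   __device__ int warp_reduce_sum_sync(int v) {
//     for (int delta = warpSize / 2; delta > 0; delta /= 2)
//       v += __shfl_xor_sync(0xffffffffu, v, delta);
//     return v; // every lane ends up with the warp-wide sum
//   }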

inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  unsigned int mask;
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
#endif
}
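
// Usage sketch (editorial illustration, not part of the upstream header):
// counting how many currently-active lanes satisfy a predicate. __activemask()
// supplies the set of participating lanes, __ballot_sync() collects one vote
// bit per lane, and __popc() (from the CUDA device-function headers) counts
// them; count_active_votes is a hypothetical helper name.
//
//   __device__ int count_active_votes(int pred) {
//     unsigned int active = __activemask();
//     unsigned int votes = __ballot_sync(active, pred);
//     return __popc(votes);
//   }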

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
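
// Usage sketch (editorial illustration, not part of the upstream header):
// grouping the lanes of a fully-active warp by a 32-bit key on sm_70+. Each
// lane receives a bitmask of the lanes whose key matches its own;
// lanes_with_same_key is a hypothetical helper name.
//
//   __device__ unsigned int lanes_with_same_key(unsigned int key) {
//     return __match32_any_sync(0xffffffffu, key);
//   }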
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000
| 255 | |||
| 256 | // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}. |
||
| 257 | |||
| 258 | // Prevent the vanilla sm_32 intrinsics header from being included. |
||
| 259 | #define __SM_32_INTRINSICS_H__ |
||
| 260 | #define __SM_32_INTRINSICS_HPP__ |
||
| 261 | |||
| 262 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 |
||
| 263 | |||
| 264 | inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); } |
||
| 265 | inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); } |
||
| 266 | inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); } |
||
| 267 | inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); } |
||
| 268 | inline __device__ long long __ldg(const long long *ptr) { |
||
| 269 | return __nvvm_ldg_ll(ptr); |
||
| 270 | } |
||
| 271 | inline __device__ unsigned char __ldg(const unsigned char *ptr) { |
||
| 272 | return __nvvm_ldg_uc(ptr); |
||
| 273 | } |
||
| 274 | inline __device__ signed char __ldg(const signed char *ptr) { |
||
| 275 | return __nvvm_ldg_uc((const unsigned char *)ptr); |
||
| 276 | } |
||
| 277 | inline __device__ unsigned short __ldg(const unsigned short *ptr) { |
||
| 278 | return __nvvm_ldg_us(ptr); |
||
| 279 | } |
||
| 280 | inline __device__ unsigned int __ldg(const unsigned int *ptr) { |
||
| 281 | return __nvvm_ldg_ui(ptr); |
||
| 282 | } |
||
| 283 | inline __device__ unsigned long __ldg(const unsigned long *ptr) { |
||
| 284 | return __nvvm_ldg_ul(ptr); |
||
| 285 | } |
||
| 286 | inline __device__ unsigned long long __ldg(const unsigned long long *ptr) { |
||
| 287 | return __nvvm_ldg_ull(ptr); |
||
| 288 | } |
||
| 289 | inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); } |
||
| 290 | inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); } |
||
| 291 | |||
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment. This is only safe
  // if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
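
// Usage sketch (editorial illustration, not part of the upstream header):
// streaming a read-only input through __ldg so the load can use the
// non-coherent cache path. Assumes the input is not written elsewhere in the
// kernel; scale_kernel is a hypothetical name.
//
//   __global__ void scale_kernel(float *out, const float *in, float s, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n)
//       out[i] = s * __ldg(&in[i]);
//   }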

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}
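
// Usage sketch (editorial illustration, not part of the upstream header): a
// 32-bit left rotate built from the wrapping funnel shift. Feeding the same
// word in as both halves makes the bits shifted out on one side reappear on
// the other; rotl32 is a hypothetical helper name.
//
//   __device__ unsigned rotl32(unsigned x, unsigned n) {
//     return __funnelshift_l(x, x, n); // rotate left by (n & 31)
//   }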

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
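
// Usage sketch (editorial illustration, not part of the upstream header):
// converting a generic pointer to a __shared__ buffer into a .shared-window
// address, e.g. for inline PTX that expects a shared-state-space operand;
// smem_addr is a hypothetical helper name.
//
//   __device__ unsigned int smem_addr(void) {
//     __shared__ int buf[32];
//     return (unsigned int)__nvvm_get_smem_pointer(buf);
//   }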
#endif // CUDA_VERSION >= 11000

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)