Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 14 | pmbaty | 1 | /*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---=== |
| 2 | * |
||
| 3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
| 4 | * See https://llvm.org/LICENSE.txt for license information. |
||
| 5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
| 6 | * |
||
| 7 | *===-----------------------------------------------------------------------=== |
||
| 8 | * |
||
| 9 | * This header provides in-header implmentations for NVCC's built-in |
||
| 10 | * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The |
||
| 11 | * built-in is unusual as it's actually a set of function overloads that use the |
||
| 12 | * first string literal argument as one of the overload parameters. |
||
| 13 | */ |
||
| 14 | #ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__ |
||
| 15 | #define __CLANG_CUDA_TEXTURE_INTRINSICS_H__ |
||
| 16 | #ifndef __CUDA__ |
||
| 17 | #error "This file is for CUDA compilation only." |
||
| 18 | #endif |
||
| 19 | |||
| 20 | // __nv_tex_surf_handler() provided by this header as a macro. |
||
| 21 | #define __nv_tex_surf_handler(__op, __ptr, ...) \ |
||
| 22 | ::__cuda_tex::__tex_fetch< \ |
||
| 23 | ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \ |
||
| 24 | __VA_ARGS__) |
||
| 25 | |||
| 26 | #pragma push_macro("__ASM_OUT") |
||
| 27 | #pragma push_macro("__ASM_OUTP") |
||
| 28 | #pragma push_macro("__Args") |
||
| 29 | #pragma push_macro("__ID") |
||
| 30 | #pragma push_macro("__IDV") |
||
| 31 | #pragma push_macro("__IMPL_2DGATHER") |
||
| 32 | #pragma push_macro("__IMPL_ALIAS") |
||
| 33 | #pragma push_macro("__IMPL_ALIASI") |
||
| 34 | #pragma push_macro("__IMPL_F1") |
||
| 35 | #pragma push_macro("__IMPL_F3") |
||
| 36 | #pragma push_macro("__IMPL_F3N") |
||
| 37 | #pragma push_macro("__IMPL_F3S") |
||
| 38 | #pragma push_macro("__IMPL_S") |
||
| 39 | #pragma push_macro("__IMPL_S3") |
||
| 40 | #pragma push_macro("__IMPL_S3I") |
||
| 41 | #pragma push_macro("__IMPL_S3N") |
||
| 42 | #pragma push_macro("__IMPL_S3NI") |
||
| 43 | #pragma push_macro("__IMPL_S3S") |
||
| 44 | #pragma push_macro("__IMPL_S3SI") |
||
| 45 | #pragma push_macro("__IMPL_SI") |
||
| 46 | #pragma push_macro("__L") |
||
| 47 | #pragma push_macro("__STRIP_PARENS") |
||
| 48 | |||
| 49 | // Put all functions into anonymous namespace so they have internal linkage. |
||
| 50 | // The device-only function here must be internal in order to avoid ODR |
||
| 51 | // violations in case they are used from the files compiled with |
||
| 52 | // -fgpu-rdc. E.g. a library and an app using it may be built with a different |
||
| 53 | // version of this header file. |
||
| 54 | namespace { |
||
| 55 | |||
| 56 | // Put the implmentation into its own namespace so we don't pollute the TU. |
||
| 57 | namespace __cuda_tex { |
||
| 58 | |||
| 59 | // First, we need a perfect hash function and a few constexpr helper functions |
||
| 60 | // for converting a string literal into a numeric value which can be used to |
||
| 61 | // parametrize a template. We can not use string literals for that as that would |
||
| 62 | // require C++20. |
||
| 63 | // |
||
| 64 | // The hash function was generated with 'gperf' and then manually converted into |
||
| 65 | // its constexpr equivalent. |
||
| 66 | // |
||
| 67 | // NOTE: the perfect hashing scheme comes with inherent self-test. If the hash |
||
| 68 | // function has a collision for any of the texture operations, the compilation |
||
| 69 | // will fail due to an attempt to redefine a tag with the same value. If the |
||
| 70 | // header compiles, then the hash function is good enough for the job. |
||
| 71 | |||
| 72 | constexpr int __tex_len(const char *s) { |
||
| 73 | return (s[0] == 0) ? 0 |
||
| 74 | : (s[1] == 0) ? 1 |
||
| 75 | : (s[2] == 0) ? 2 |
||
| 76 | : (s[3] == 0) ? 3 |
||
| 77 | : (s[4] == 0) ? 4 |
||
| 78 | : (s[5] == 0) ? 5 |
||
| 79 | : (s[6] == 0) ? 6 |
||
| 80 | : (s[7] == 0) ? 7 |
||
| 81 | : (s[8] == 0) ? 8 |
||
| 82 | : (s[9] == 0) ? 9 |
||
| 83 | : (s[10] == 0) ? 10 |
||
| 84 | : (s[11] == 0) ? 11 |
||
| 85 | : (s[12] == 0) ? 12 |
||
| 86 | : (s[13] == 0) ? 13 |
||
| 87 | : (s[14] == 0) ? 14 |
||
| 88 | : (s[15] == 0) ? 15 |
||
| 89 | : (s[16] == 0) ? 16 |
||
| 90 | : (s[17] == 0) ? 17 |
||
| 91 | : (s[18] == 0) ? 18 |
||
| 92 | : (s[19] == 0) ? 19 |
||
| 93 | : (s[20] == 0) ? 20 |
||
| 94 | : (s[21] == 0) ? 21 |
||
| 95 | : (s[22] == 0) ? 22 |
||
| 96 | : (s[23] == 0) ? 23 |
||
| 97 | : (s[24] == 0) ? 24 |
||
| 98 | : (s[25] == 0) ? 25 |
||
| 99 | : (s[26] == 0) ? 26 |
||
| 100 | : (s[27] == 0) ? 27 |
||
| 101 | : (s[28] == 0) ? 28 |
||
| 102 | : (s[29] == 0) ? 29 |
||
| 103 | : (s[30] == 0) ? 30 |
||
| 104 | : (s[31] == 0) ? 31 |
||
| 105 | : 32; |
||
| 106 | } |
||
| 107 | |||
| 108 | constexpr int __tex_hash_map(int c) { |
||
| 109 | return (c == 49) ? 10 |
||
| 110 | : (c == 50) ? 0 |
||
| 111 | : (c == 51) ? 100 |
||
| 112 | : (c == 52) ? 30 |
||
| 113 | : (c == 67) ? 10 |
||
| 114 | : (c == 68) ? 0 |
||
| 115 | : (c == 69) ? 25 |
||
| 116 | : (c == 72) ? 70 |
||
| 117 | : (c == 77) ? 0 |
||
| 118 | : (c == 96) ? 44 |
||
| 119 | : (c == 99) ? 10 |
||
| 120 | : (c == 100) ? 5 |
||
| 121 | : (c == 101) ? 60 |
||
| 122 | : (c == 102) ? 40 |
||
| 123 | : (c == 103) ? 70 |
||
| 124 | : (c == 104) ? 25 |
||
| 125 | : (c == 112) ? 0 |
||
| 126 | : (c == 114) ? 45 |
||
| 127 | : (c == 117) ? 5 |
||
| 128 | : (c == 118) ? 85 |
||
| 129 | : (c == 120) ? 20 |
||
| 130 | : 225; |
||
| 131 | } |
||
| 132 | |||
| 133 | constexpr int __tex_op_hash(const char *str) { |
||
| 134 | return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) + |
||
| 135 | __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]); |
||
| 136 | } |
||
| 137 | |||
| 138 | // Tag type to identify particular texture operation. |
||
| 139 | template <int N> struct __Tag; |
||
| 140 | #define __ID(__op) __Tag<__tex_op_hash(__op)> |
||
| 141 | // Tags for variants of particular operation. E.g. tex2Dgather can translate |
||
| 142 | // into 4 different instructions. |
||
| 143 | #define __IDV(__op, __variant) \ |
||
| 144 | __Tag<10000 + __tex_op_hash(__op) * 100 + __variant> |
||
| 145 | |||
| 146 | // Helper classes for figuring out key data types for derived types. |
||
| 147 | // E.g. char2 has __base_t = char, __fetch_t = char4 |
||
| 148 | template <class> struct __TypeInfoT; |
||
| 149 | // Type info for the fundamental types. |
||
| 150 | template <> struct __TypeInfoT<float> { |
||
| 151 | using __base_t = float; |
||
| 152 | using __fetch_t = float4; |
||
| 153 | }; |
||
| 154 | template <> struct __TypeInfoT<char> { |
||
| 155 | using __base_t = char; |
||
| 156 | using __fetch_t = int4; |
||
| 157 | }; |
||
| 158 | template <> struct __TypeInfoT<signed char> { |
||
| 159 | using __base_t = signed char; |
||
| 160 | using __fetch_t = int4; |
||
| 161 | }; |
||
| 162 | template <> struct __TypeInfoT<unsigned char> { |
||
| 163 | using __base_t = unsigned char; |
||
| 164 | using __fetch_t = uint4; |
||
| 165 | }; |
||
| 166 | template <> struct __TypeInfoT<short> { |
||
| 167 | using __base_t = short; |
||
| 168 | using __fetch_t = int4; |
||
| 169 | }; |
||
| 170 | template <> struct __TypeInfoT<unsigned short> { |
||
| 171 | using __base_t = unsigned short; |
||
| 172 | using __fetch_t = uint4; |
||
| 173 | }; |
||
| 174 | template <> struct __TypeInfoT<int> { |
||
| 175 | using __base_t = int; |
||
| 176 | using __fetch_t = int4; |
||
| 177 | }; |
||
| 178 | template <> struct __TypeInfoT<unsigned int> { |
||
| 179 | using __base_t = unsigned int; |
||
| 180 | using __fetch_t = uint4; |
||
| 181 | }; |
||
| 182 | |||
| 183 | // Derived base/fetch types for N-element vectors. |
||
| 184 | template <class __T> struct __TypeInfoT { |
||
| 185 | using __base_t = decltype(__T::x); |
||
| 186 | using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t; |
||
| 187 | }; |
||
| 188 | |||
| 189 | // Classes that implement specific texture ops. |
||
| 190 | template <class __op> struct __tex_fetch_v4; |
||
| 191 | |||
| 192 | // Helper macros to strip parens from a macro argument. |
||
| 193 | #define __Args(...) __VA_ARGS__ |
||
| 194 | #define __STRIP_PARENS(__X) __X |
||
| 195 | #define __L(__X) __STRIP_PARENS(__Args __X) |
||
| 196 | |||
| 197 | // Construct inline assembly output args. |
||
| 198 | // Results are stored in a temp var __r. |
||
| 199 | // isResident bool is pointed to by __ir |
||
| 200 | // Asm args for return values. It's a 4-element vector |
||
| 201 | #define __ASM_OUT(__t) \ |
||
| 202 | ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w)) |
||
| 203 | // .. possibly combined with a predicate. |
||
| 204 | #define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir)) |
||
| 205 | |||
| 206 | // Implements a single variant of texture fetch instruction. |
||
| 207 | #define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \ |
||
| 208 | template <> \ |
||
| 209 | __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \ |
||
| 210 | __rt __r; \ |
||
| 211 | asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \ |
||
| 212 | return __r; \ |
||
| 213 | } |
||
| 214 | |||
| 215 | // Implements texture fetch instructions for int4/uint4/float4 data types. |
||
| 216 | #define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 217 | __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \ |
||
| 218 | __ASM_OUT("r"), __asm_args) \ |
||
| 219 | __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \ |
||
| 220 | __ASM_OUT("r"), __asm_args) \ |
||
| 221 | __IMPL_F1(float4, float4, __args, \ |
||
| 222 | __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \ |
||
| 223 | __asm_args) |
||
| 224 | // Implements 'sparse' texture fetch instructions for int4/uint4/float4 data |
||
| 225 | // types. Similar to above, but returns a boolean 'isPresent' value in addition |
||
| 226 | // to texture data, |
||
| 227 | #define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 228 | __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \ |
||
| 229 | __ASM_OUTP("r"), __asm_args) \ |
||
| 230 | __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \ |
||
| 231 | __ASM_OUTP("r"), __asm_args) \ |
||
| 232 | __IMPL_F1(float4, float4, __args, \ |
||
| 233 | __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \ |
||
| 234 | __asm_args) |
||
| 235 | |||
| 236 | // Similar to F3, but for integer data which is returned as normalized floats. |
||
| 237 | // Only instantiates fetch functions for int4/uint4. |
||
| 238 | #define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 239 | __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \ |
||
| 240 | __ASM_OUT("r"), __asm_args) \ |
||
| 241 | __IMPL_F1(float4, uint4, __args, \ |
||
| 242 | __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \ |
||
| 243 | __asm_args) |
||
| 244 | |||
| 245 | // Instantiates __tex_fetch_v4 with regular fetch functions. |
||
| 246 | #define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 247 | template <> struct __tex_fetch_v4<__op> { \ |
||
| 248 | template <class T> \ |
||
| 249 | __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \ |
||
| 250 | __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 251 | } |
||
| 252 | |||
| 253 | // Same, but for sparse ops. Only available on sm_60+ |
||
| 254 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) |
||
| 255 | #define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \ |
||
| 256 | __asm_args) \ |
||
| 257 | template <> struct __tex_fetch_v4<__op> { \ |
||
| 258 | template <class T> \ |
||
| 259 | __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \ |
||
| 260 | __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 261 | } |
||
| 262 | #else |
||
| 263 | #define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) |
||
| 264 | #endif |
||
| 265 | |||
| 266 | // Same, but for normalized float ops. |
||
| 267 | #define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \ |
||
| 268 | __asm_args) \ |
||
| 269 | template <> struct __tex_fetch_v4<__op> { \ |
||
| 270 | template <class T> \ |
||
| 271 | __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \ |
||
| 272 | __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 273 | } |
||
| 274 | |||
| 275 | // Regular and normalized float ops share a lot of similarities. This macro |
||
| 276 | // instantiates both variants -- normal for __op and normalized for __opn. |
||
| 277 | #define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \ |
||
| 278 | __asm_args) \ |
||
| 279 | __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \ |
||
| 280 | __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args) |
||
| 281 | |||
| 282 | // Convenience macros which converts string literal __op into a __Tag, |
||
| 283 | #define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 284 | __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) |
||
| 285 | #define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 286 | __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) |
||
| 287 | #define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \ |
||
| 288 | __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args) |
||
| 289 | #define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \ |
||
| 290 | __asm_args) \ |
||
| 291 | __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \ |
||
| 292 | __asm_args) |
||
| 293 | |||
| 294 | // CUDA headers have some 'legacy' texture oprerations that duplicate |
||
| 295 | // functionality. So, we just inherit it, instead of refining a copy. |
||
| 296 | #define __IMPL_ALIASI(__op, __opn) \ |
||
| 297 | template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {} |
||
| 298 | #define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn)) |
||
| 299 | |||
| 300 | // Now we can instantiate everything we need for each specific texture fetch |
||
| 301 | // variant. |
||
| 302 | __IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32", |
||
| 303 | "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x))); |
||
| 304 | __IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4", |
||
| 305 | "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x))); |
||
| 306 | __IMPL_ALIAS("__itex1D", "__tex1D_v2"); |
||
| 307 | __IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2"); |
||
| 308 | |||
| 309 | __IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2", |
||
| 310 | (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32", |
||
| 311 | "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};", |
||
| 312 | ("f"(__x), "f"(__dPdx), "f"(__dPdy))); |
||
| 313 | __IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2"); |
||
| 314 | |||
| 315 | __IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2", |
||
| 316 | (float __x, int __layer), "tex.a1d.v4", "f32", |
||
| 317 | "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x))); |
||
| 318 | __IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2"); |
||
| 319 | |||
| 320 | __IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2", |
||
| 321 | (float __x, int __layer, float __dPdx, float __dPdy), |
||
| 322 | "tex.grad.a1d.v4", "f32", |
||
| 323 | "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};", |
||
| 324 | ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy))); |
||
| 325 | __IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2"); |
||
| 326 | |||
| 327 | __IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2", |
||
| 328 | (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32", |
||
| 329 | "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;", |
||
| 330 | ("r"(__layer), "f"(__x), "f"(__level))); |
||
| 331 | __IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2"); |
||
| 332 | |||
| 333 | __IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level), |
||
| 334 | "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;", |
||
| 335 | ("f"(__x), "f"(__level))); |
||
| 336 | __IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2"); |
||
| 337 | |||
| 338 | // 2D |
||
| 339 | __IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4", |
||
| 340 | "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); |
||
| 341 | __IMPL_ALIAS("__itex2D", "__tex2D_v2"); |
||
| 342 | |||
| 343 | __IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir), |
||
| 344 | "{.reg .pred %%p0;\n\t" |
||
| 345 | "tex.2d.v4", |
||
| 346 | "f32", |
||
| 347 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" |
||
| 348 | " selp.u16 %4, 1, 0, %%p0; }", |
||
| 349 | ("f"(__x), "f"(__y))); |
||
| 350 | |||
| 351 | __IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2", |
||
| 352 | (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy), |
||
| 353 | "tex.grad.2d.v4", "f32", |
||
| 354 | "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};", |
||
| 355 | ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x), |
||
| 356 | "f"(__dPdy->y))); |
||
| 357 | __IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2"); |
||
| 358 | |||
| 359 | __IMPL_S3S("__itex2DGrad_sparse", |
||
| 360 | (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy, |
||
| 361 | unsigned char *__ir), |
||
| 362 | "{.reg .pred %%p0;\n\t" |
||
| 363 | "tex.grad.2d.v4", |
||
| 364 | "f32", |
||
| 365 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t" |
||
| 366 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 367 | ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x), |
||
| 368 | "f"(__dPdy->y))); |
||
| 369 | |||
| 370 | __IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2", |
||
| 371 | (float __x, float __y, int __layer), "tex.a2d.v4", "f32", |
||
| 372 | "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];", |
||
| 373 | ("r"(__layer), "f"(__x), "f"(__y))); |
||
| 374 | __IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2"); |
||
| 375 | |||
| 376 | __IMPL_S3S("__itex2DLayered_sparse", |
||
| 377 | (float __x, float __y, int __layer, unsigned char *__ir), |
||
| 378 | "{.reg .pred %%p0;\n\t" |
||
| 379 | "tex.a2d.v4", |
||
| 380 | "f32", |
||
| 381 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t" |
||
| 382 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 383 | ("r"(__layer), "f"(__x), "f"(__y))); |
||
| 384 | |||
| 385 | __IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2", |
||
| 386 | (float __x, float __y, int __layer, const float2 *__dPdx, |
||
| 387 | const float2 *__dPdy), |
||
| 388 | "tex.grad.a2d.v4", "f32", |
||
| 389 | "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};", |
||
| 390 | ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), |
||
| 391 | "f"(__dPdy->x), "f"(__dPdy->y))); |
||
| 392 | __IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2"); |
||
| 393 | |||
| 394 | __IMPL_S3S( |
||
| 395 | "__itex2DLayeredGrad_sparse", |
||
| 396 | (float __x, float __y, int __layer, const float2 *__dPdx, |
||
| 397 | const float2 *__dPdy, unsigned char *__ir), |
||
| 398 | "{.reg .pred %%p0;\n\t" |
||
| 399 | "tex.grad.a2d.v4", |
||
| 400 | "f32", |
||
| 401 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t" |
||
| 402 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 403 | ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), |
||
| 404 | "f"(__dPdy->x), "f"(__dPdy->y))); |
||
| 405 | |||
| 406 | __IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2", |
||
| 407 | (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4", |
||
| 408 | "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;", |
||
| 409 | ("r"(__layer), "f"(__x), "f"(__y), "f"(__level))); |
||
| 410 | __IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2"); |
||
| 411 | |||
| 412 | __IMPL_S3S("__itex2DLayeredLod_sparse", |
||
| 413 | (float __x, float __y, int __layer, float __level, |
||
| 414 | unsigned char *__ir), |
||
| 415 | "{.reg .pred %%p0;\n\t" |
||
| 416 | "tex.level.a2d.v4", |
||
| 417 | "f32", |
||
| 418 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t" |
||
| 419 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 420 | ("r"(__layer), "f"(__x), "f"(__y), "f"(__level))); |
||
| 421 | |||
| 422 | __IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2", |
||
| 423 | (float __x, float __y, float __level), "tex.level.2d.v4", "f32", |
||
| 424 | "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;", |
||
| 425 | ("f"(__x), "f"(__y), "f"(__level))); |
||
| 426 | __IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2"); |
||
| 427 | |||
| 428 | __IMPL_S3S("__itex2DLod_sparse", |
||
| 429 | (float __x, float __y, float __level, unsigned char *__ir), |
||
| 430 | "{.reg .pred %%p0;\n\t" |
||
| 431 | "tex.level.2d.v4", |
||
| 432 | "f32", |
||
| 433 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t" |
||
| 434 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 435 | ("f"(__x), "f"(__y), "f"(__level))); |
||
| 436 | |||
| 437 | // 2D gather is special. Unlike other variants that translate into exactly one |
||
| 438 | // asm instruction, it uses one of the four different instructions selected by |
||
| 439 | // __comp. We implement each instruction variant separately, and dispatch the |
||
| 440 | // right one from the manually implemented 'umbrella' fetch. |
||
| 441 | #define __IMPL_2DGATHER(variant, instr) \ |
||
| 442 | __IMPL_SI(__IDV("__tex2Dgather_v2", variant), \ |
||
| 443 | __IDV("__tex2Dgather_rmnf_v2", variant), \ |
||
| 444 | (float __x, float __y, int __comp), instr, "f32", \ |
||
| 445 | "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \ |
||
| 446 | __IMPL_ALIASI(__IDV("__itex2Dgather", variant), \ |
||
| 447 | __IDV("__tex2Dgather_v2", variant)); \ |
||
| 448 | __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \ |
||
| 449 | (float __x, float __y, unsigned char *__ir, int __comp), \ |
||
| 450 | "{.reg .pred %%p0;\n\t" instr, "f32", \ |
||
| 451 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \ |
||
| 452 | "selp.u16 %4, 1, 0, %%p0; }", \ |
||
| 453 | ("f"(__x), "f"(__y))); |
||
| 454 | __IMPL_2DGATHER(0, "tld4.r.2d.v4"); |
||
| 455 | __IMPL_2DGATHER(1, "tld4.g.2d.v4"); |
||
| 456 | __IMPL_2DGATHER(2, "tld4.b.2d.v4"); |
||
| 457 | __IMPL_2DGATHER(3, "tld4.a.2d.v4"); |
||
| 458 | |||
| 459 | // Umbrella dispatcher -- calls into specific 2Dgather variant. |
||
| 460 | template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> { |
||
| 461 | template <class __T> |
||
| 462 | __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y, |
||
| 463 | int __comp) { |
||
| 464 | switch (__comp) { |
||
| 465 | case 0: |
||
| 466 | return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>( |
||
| 467 | __obj, __x, __y, __comp); |
||
| 468 | case 1: |
||
| 469 | return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>( |
||
| 470 | __obj, __x, __y, __comp); |
||
| 471 | case 2: |
||
| 472 | return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>( |
||
| 473 | __obj, __x, __y, __comp); |
||
| 474 | case 3: |
||
| 475 | return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>( |
||
| 476 | __obj, __x, __y, __comp); |
||
| 477 | } |
||
| 478 | } |
||
| 479 | }; |
||
| 480 | __IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2"); |
||
| 481 | |||
| 482 | template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> { |
||
| 483 | template <class __T> |
||
| 484 | __device__ static float4 __run(cudaTextureObject_t __obj, float __x, |
||
| 485 | float __y, int __comp) { |
||
| 486 | switch (__comp) { |
||
| 487 | case 0: |
||
| 488 | return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>( |
||
| 489 | __obj, __x, __y, __comp); |
||
| 490 | case 1: |
||
| 491 | return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>( |
||
| 492 | __obj, __x, __y, __comp); |
||
| 493 | case 2: |
||
| 494 | return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>( |
||
| 495 | __obj, __x, __y, __comp); |
||
| 496 | case 3: |
||
| 497 | return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>( |
||
| 498 | __obj, __x, __y, __comp); |
||
| 499 | } |
||
| 500 | } |
||
| 501 | }; |
||
| 502 | |||
| 503 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) |
||
| 504 | template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> { |
||
| 505 | template <class __T> |
||
| 506 | __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y, |
||
| 507 | unsigned char *__ir, int __comp) { |
||
| 508 | switch (__comp) { |
||
| 509 | case 0: |
||
| 510 | return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>( |
||
| 511 | __obj, __x, __y, __ir, __comp); |
||
| 512 | case 1: |
||
| 513 | return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>( |
||
| 514 | __obj, __x, __y, __ir, __comp); |
||
| 515 | case 2: |
||
| 516 | return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>( |
||
| 517 | __obj, __x, __y, __ir, __comp); |
||
| 518 | case 3: |
||
| 519 | return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>( |
||
| 520 | __obj, __x, __y, __ir, __comp); |
||
| 521 | } |
||
| 522 | } |
||
| 523 | }; |
||
| 524 | #endif |
||
| 525 | |||
| 526 | // 3D |
||
| 527 | __IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z), |
||
| 528 | "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];", |
||
| 529 | ("f"(__x), "f"(__y), "f"(__z))); |
||
| 530 | __IMPL_ALIAS("__itex3D", "__tex3D_v2"); |
||
| 531 | |||
| 532 | __IMPL_S3S("__itex3D_sparse", |
||
| 533 | (float __x, float __y, float __z, unsigned char *__ir), |
||
| 534 | "{.reg .pred %%p0;\n\t" |
||
| 535 | "tex.3d.v4", |
||
| 536 | "f32", |
||
| 537 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t" |
||
| 538 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 539 | ("f"(__x), "f"(__y), "f"(__z))); |
||
| 540 | |||
| 541 | __IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2", |
||
| 542 | (float __x, float __y, float __z, const float4 *__dPdx, |
||
| 543 | const float4 *__dPdy), |
||
| 544 | "tex.grad.3d.v4", "f32", |
||
| 545 | "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], " |
||
| 546 | "{%8, %9, %10, %10}, {%11, %12, %13, %13};", |
||
| 547 | ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y), |
||
| 548 | "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z))); |
||
| 549 | __IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2"); |
||
| 550 | |||
| 551 | __IMPL_S3S("__itex3DGrad_sparse", |
||
| 552 | (float __x, float __y, float __z, const float4 *__dPdx, |
||
| 553 | const float4 *__dPdy, unsigned char *__ir), |
||
| 554 | "{.reg .pred %%p0;\n\t" |
||
| 555 | "tex.grad.3d.v4", |
||
| 556 | "f32", |
||
| 557 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], " |
||
| 558 | "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t" |
||
| 559 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 560 | ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y), |
||
| 561 | "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z))); |
||
| 562 | |||
| 563 | __IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2", |
||
| 564 | (float __x, float __y, float __z, float __level), "tex.level.3d.v4", |
||
| 565 | "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;", |
||
| 566 | ("f"(__x), "f"(__y), "f"(__z), "f"(__level))); |
||
| 567 | __IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2"); |
||
| 568 | |||
| 569 | __IMPL_S3S("__itex3DLod_sparse", |
||
| 570 | (float __x, float __y, float __z, float __level, |
||
| 571 | unsigned char *__ir), |
||
| 572 | "{.reg .pred %%p0;\n\t" |
||
| 573 | "tex.level.3d.v4", |
||
| 574 | "f32", |
||
| 575 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t" |
||
| 576 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 577 | ("f"(__x), "f"(__y), "f"(__z), "f"(__level))); |
||
| 578 | |||
| 579 | // Cubemap |
||
| 580 | __IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2", |
||
| 581 | (float __x, float __y, float __z), "tex.cube.v4", "f32", |
||
| 582 | "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];", |
||
| 583 | ("f"(__x), "f"(__y), "f"(__z))); |
||
| 584 | __IMPL_ALIAS("__itexCubemap", "__texCubemap_v2"); |
||
| 585 | |||
| 586 | __IMPL_S3S("__itexCubemap_sparse", |
||
| 587 | (float __x, float __y, float __z, unsigned char *__ir), |
||
| 588 | "{.reg .pred %%p0;\n\t" |
||
| 589 | "tex.cube.v4", |
||
| 590 | "f32", |
||
| 591 | "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t" |
||
| 592 | "selp.u16 %4, 1, 0, %%p0; }", |
||
| 593 | ("f"(__x), "f"(__y), "f"(__z))); |
||
| 594 | |||
| 595 | __IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2", |
||
| 596 | (float __x, float __y, float __z, const float4 *__dPdx, |
||
| 597 | const float4 *__dPdy), |
||
| 598 | "tex.grad.cube.v4", "f32", |
||
| 599 | "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], " |
||
| 600 | "{%8, %9, %10, %10}, {%11, %12, %13, %13};", |
||
| 601 | ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y), |
||
| 602 | "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z))); |
||
| 603 | __IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2"); |
||
| 604 | |||
| 605 | __IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2", |
||
| 606 | (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32", |
||
| 607 | "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];", |
||
| 608 | ("r"(__layer), "f"(__x), "f"(__y), "f"(__z))); |
||
| 609 | __IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2"); |
||
| 610 | |||
| 611 | __IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2", |
||
| 612 | (float __x, float __y, float __z, int __layer, const float4 *__dPdx, |
||
| 613 | const float4 *__dPdy), |
||
| 614 | "tex.grad.acube.v4", "f32", |
||
| 615 | "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], " |
||
| 616 | "{%9, %10, %11, %11}, {%12, %13, %14, %14};", |
||
| 617 | ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), |
||
| 618 | "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), |
||
| 619 | "f"(__dPdy->z))); |
||
| 620 | __IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2"); |
||
| 621 | |||
| 622 | __IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2", |
||
| 623 | (float __x, float __y, float __z, int __layer, float __level), |
||
| 624 | "tex.level.acube.v4", "f32", |
||
| 625 | "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;", |
||
| 626 | ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level))); |
||
| 627 | __IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2"); |
||
| 628 | |||
| 629 | __IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2", |
||
| 630 | (float __x, float __y, float __z, float __level), "tex.level.cube.v4", |
||
| 631 | "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;", |
||
| 632 | ("f"(__x), "f"(__y), "f"(__z), "f"(__level))); |
||
| 633 | __IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2"); |
||
| 634 | |||
| 635 | // Helper class for extracting slice of data from V4 fetch results. |
||
| 636 | template <class __DestT, class __SrcT> struct __convert { |
||
| 637 | template <int __NElements = sizeof(__DestT) / |
||
| 638 | sizeof(typename __TypeInfoT<__DestT>::__base_t)> |
||
| 639 | __device__ static __DestT __run(__SrcT __v); |
||
| 640 | template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; } |
||
| 641 | template <> __device__ static __DestT __run<2>(__SrcT __v) { |
||
| 642 | return {__v.x, __v.y}; |
||
| 643 | } |
||
| 644 | template <> __device__ static __DestT __run<3>(__SrcT __v) { |
||
| 645 | return {__v.x, __v.y, __v.z}; |
||
| 646 | } |
||
| 647 | template <> __device__ static __DestT __run<4>(__SrcT __v) { |
||
| 648 | return {__v.x, __v.y, __v.z, __v.w}; |
||
| 649 | } |
||
| 650 | }; |
||
| 651 | |||
| 652 | // These are the top-level function overloads the __nv_tex_surf_handler expands |
||
| 653 | // to. Each overload deals with one of the several ways __nv_tex_surf_handler |
||
| 654 | // is called by CUDA headers. In the end, each of the overloads does the same |
||
| 655 | // job -- it figures out which `__tex_fetch_v4::run` variant should be used to |
||
| 656 | // fetch texture data and which `__convert::run` is needed to convert it into |
||
| 657 | // appropriate return type. |
||
| 658 | |||
| 659 | // __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...); |
||
| 660 | // Data type and return type are based on ret. |
||
| 661 | template <class __op, class __T, class... __Args> |
||
| 662 | __device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle, |
||
| 663 | __Args... __args) { |
||
| 664 | using __FetchT = typename __TypeInfoT<__T>::__fetch_t; |
||
| 665 | *__ptr = __convert<__T, __FetchT>::__run( |
||
| 666 | __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...)); |
||
| 667 | } |
||
| 668 | |||
| 669 | #if CUDA_VERSION < 12000 |
||
| 670 | // texture<> objects get magically converted into a texture reference. However, |
||
| 671 | // there's no way to convert them to cudaTextureObject_t on C++ level. So, we |
||
| 672 | // cheat a bit and use inline assembly to do it. It costs us an extra register |
||
| 673 | // and a move, but that is easy for ptxas to optimize away. |
||
| 674 | template <class __T> |
||
| 675 | __device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) { |
||
| 676 | cudaTextureObject_t __obj; |
||
| 677 | asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle)); |
||
| 678 | return __obj; |
||
| 679 | } |
||
| 680 | |||
| 681 | // __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...); |
||
| 682 | // Data type and return type is based on ret. |
||
| 683 | template <class __op, class __T, class __HandleT, class... __Args> |
||
| 684 | __device__ static void __tex_fetch(__T *__ptr, __HandleT __handle, |
||
| 685 | __Args... __args) { |
||
| 686 | using __FetchT = typename __TypeInfoT<__T>::__fetch_t; |
||
| 687 | *__ptr = __convert<__T, __FetchT>::__run( |
||
| 688 | __tex_fetch_v4<__op>::template __run<__FetchT>( |
||
| 689 | __tex_handle_to_obj(__handle), __args...)); |
||
| 690 | } |
||
| 691 | |||
| 692 | // __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...); |
||
| 693 | // cudaReadModeNormalizedFloat fetches always return float4. |
||
| 694 | template <class __op, class __DataT, class __RetT, int __TexT, class... __Args> |
||
| 695 | __device__ static void |
||
| 696 | __tex_fetch(__DataT *, __RetT *__ptr, |
||
| 697 | texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle, |
||
| 698 | __Args... __args) { |
||
| 699 | using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t; |
||
| 700 | *__ptr = __convert<__RetT, float4>::__run( |
||
| 701 | __tex_fetch_v4<__op>::template __run<__FetchT>( |
||
| 702 | __tex_handle_to_obj(__handle), __args...)); |
||
| 703 | } |
||
| 704 | |||
| 705 | // __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...); |
||
| 706 | // For cudaReadModeElementType fetch return type is based on type_dummy. |
||
| 707 | template <class __op, class __DataT, class __RetT, int __TexT, class... __Args> |
||
| 708 | __device__ static void |
||
| 709 | __tex_fetch(__DataT *, __RetT *__ptr, |
||
| 710 | texture<__DataT, __TexT, cudaReadModeElementType> __handle, |
||
| 711 | __Args... __args) { |
||
| 712 | using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t; |
||
| 713 | *__ptr = __convert<__RetT, __FetchT>::__run( |
||
| 714 | __tex_fetch_v4<__op>::template __run<__FetchT>( |
||
| 715 | __tex_handle_to_obj(__handle), __args...)); |
||
| 716 | } |
||
| 717 | #endif // CUDA_VERSION |
||
| 718 | } // namespace __cuda_tex |
||
| 719 | } // namespace |
||
| 720 | #pragma pop_macro("__ASM_OUT") |
||
| 721 | #pragma pop_macro("__ASM_OUTP") |
||
| 722 | #pragma pop_macro("__Args") |
||
| 723 | #pragma pop_macro("__ID") |
||
| 724 | #pragma pop_macro("__IDV") |
||
| 725 | #pragma pop_macro("__IMPL_2DGATHER") |
||
| 726 | #pragma pop_macro("__IMPL_ALIAS") |
||
| 727 | #pragma pop_macro("__IMPL_ALIASI") |
||
| 728 | #pragma pop_macro("__IMPL_F1") |
||
| 729 | #pragma pop_macro("__IMPL_F3") |
||
| 730 | #pragma pop_macro("__IMPL_F3N") |
||
| 731 | #pragma pop_macro("__IMPL_F3S") |
||
| 732 | #pragma pop_macro("__IMPL_S") |
||
| 733 | #pragma pop_macro("__IMPL_S3") |
||
| 734 | #pragma pop_macro("__IMPL_S3I") |
||
| 735 | #pragma pop_macro("__IMPL_S3N") |
||
| 736 | #pragma pop_macro("__IMPL_S3NI") |
||
| 737 | #pragma pop_macro("__IMPL_S3S") |
||
| 738 | #pragma pop_macro("__IMPL_S3SI") |
||
| 739 | #pragma pop_macro("__IMPL_SI") |
||
| 740 | #pragma pop_macro("__L") |
||
| 741 | #pragma pop_macro("__STRIP_PARENS") |
||
| 742 | #endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__ |