/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 *
 * This header provides in-header implementations for NVCC's built-in
 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers.  The
 * built-in is unusual as it's actually a set of function overloads that use the
 * first string literal argument as one of the overload parameters.
 */
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// __nv_tex_surf_handler() is provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...)                                \
  ::__cuda_tex::__tex_fetch<                                                   \
      ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr,           \
                                                              __VA_ARGS__)
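// For example, a call like
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// expands to
//   ::__cuda_tex::__tex_fetch<
//       ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex2D_v2")>>(
//       &__ret, __obj, __x, __y);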

#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")

// Put all functions into an anonymous namespace so they have internal linkage.
// The device-only functions here must be internal in order to avoid ODR
// violations in case they are used from files compiled with -fgpu-rdc.
// E.g. a library and an app using it may be built with different versions of
// this header file.
namespace {

// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {

// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We cannot use string literals for that as that would
// require C++20.
//
// The hash function was generated with 'gperf' and then manually converted into
// its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the
// hash function has a collision for any of the texture operations, compilation
// will fail due to an attempt to redefine a tag with the same value. If the
// header compiles, then the hash function is good enough for the job.

constexpr int __tex_len(const char *s) {
  return (s[0] == 0)    ? 0
         : (s[1] == 0)  ? 1
         : (s[2] == 0)  ? 2
         : (s[3] == 0)  ? 3
         : (s[4] == 0)  ? 4
         : (s[5] == 0)  ? 5
         : (s[6] == 0)  ? 6
         : (s[7] == 0)  ? 7
         : (s[8] == 0)  ? 8
         : (s[9] == 0)  ? 9
         : (s[10] == 0) ? 10
         : (s[11] == 0) ? 11
         : (s[12] == 0) ? 12
         : (s[13] == 0) ? 13
         : (s[14] == 0) ? 14
         : (s[15] == 0) ? 15
         : (s[16] == 0) ? 16
         : (s[17] == 0) ? 17
         : (s[18] == 0) ? 18
         : (s[19] == 0) ? 19
         : (s[20] == 0) ? 20
         : (s[21] == 0) ? 21
         : (s[22] == 0) ? 22
         : (s[23] == 0) ? 23
         : (s[24] == 0) ? 24
         : (s[25] == 0) ? 25
         : (s[26] == 0) ? 26
         : (s[27] == 0) ? 27
         : (s[28] == 0) ? 28
         : (s[29] == 0) ? 29
         : (s[30] == 0) ? 30
         : (s[31] == 0) ? 31
                        : 32;
}

constexpr int __tex_hash_map(int c) {
  return (c == 49)    ? 10
         : (c == 50)  ? 0
         : (c == 51)  ? 100
         : (c == 52)  ? 30
         : (c == 67)  ? 10
         : (c == 68)  ? 0
         : (c == 69)  ? 25
         : (c == 72)  ? 70
         : (c == 77)  ? 0
         : (c == 96)  ? 44
         : (c == 99)  ? 10
         : (c == 100) ? 5
         : (c == 101) ? 60
         : (c == 102) ? 40
         : (c == 103) ? 70
         : (c == 104) ? 25
         : (c == 112) ? 0
         : (c == 114) ? 45
         : (c == 117) ? 5
         : (c == 118) ? 85
         : (c == 120) ? 20
                      : 225;
}

constexpr int __tex_op_hash(const char *str) {
  return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
         __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}
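// Worked example, assuming ASCII: for "__tex2D_v2" the length is 10,
// str[7] + 1 == '_' + 1 == 96 maps to 44, and 'D', '2' and the last '2'
// all map to 0, so __tex_op_hash("__tex2D_v2") == 10 + 44 == 54.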
137
 
138
// Tag type to identify particular texture operation.
139
template <int N> struct __Tag;
140
#define __ID(__op) __Tag<__tex_op_hash(__op)>
141
// Tags for variants of particular operation. E.g. tex2Dgather can translate
142
// into 4 different instructions.
143
#define __IDV(__op, __variant)                                                 \
144
  __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
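// E.g. __IDV("__tex2Dgather_v2", 1) names the tag for the variant of
// tex2Dgather that fetches the 'g' component (see __IMPL_2DGATHER below).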

// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = int4.
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
  using __base_t = float;
  using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
  using __base_t = char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
  using __base_t = signed char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
  using __base_t = unsigned char;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
  using __base_t = short;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
  using __base_t = unsigned short;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
  using __base_t = int;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
  using __base_t = unsigned int;
  using __fetch_t = uint4;
};

// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
  using __base_t = decltype(__T::x);
  using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};

// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;

// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)
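// E.g. __L((float __x, float __y)) expands to: float __x, float __y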
196
 
197
// Construct inline assembly output args.
198
// Results are stored in a temp var __r.
199
// isResident bool is pointed to by __ir
200
// Asm args for return values. It's a 4-element vector
201
#define __ASM_OUT(__t)                                                         \
202
  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
203
// .. possibly combined with a predicate.
204
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
205
 
206
// Implements a single variant of texture fetch instruction.
207
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args)        \
208
  template <>                                                                  \
209
  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) {        \
210
    __rt __r;                                                                  \
211
    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args));             \
212
    return __r;                                                                \
213
  }
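// For illustration, __IMPL_F1(int4, int4, (float __x),
//     "tex.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}];",
//     __ASM_OUT("r"), ("f"(__x)))
// produces roughly:
//   template <>
//   __device__ int4 __run<int4>(cudaTextureObject_t __obj, float __x) {
//     int4 __r;
//     asm("tex.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
//         : "=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w)
//         : "l"(__obj), "f"(__x));
//     return __r;
//   }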
214
 
215
// Implements texture fetch instructions for int4/uint4/float4 data types.
216
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)        \
217
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
218
            __ASM_OUT("r"), __asm_args)                                        \
219
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
220
            __ASM_OUT("r"), __asm_args)                                        \
221
  __IMPL_F1(float4, float4, __args,                                            \
222
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"),       \
223
            __asm_args)
224
// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
225
// types. Similar to above, but returns a boolean 'isPresent' value in addition
226
// to texture data,
227
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
228
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
229
            __ASM_OUTP("r"), __asm_args)                                       \
230
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
231
            __ASM_OUTP("r"), __asm_args)                                       \
232
  __IMPL_F1(float4, float4, __args,                                            \
233
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"),      \
234
            __asm_args)
235
 
236
// Similar to F3, but for integer data which is returned as normalized floats.
237
// Only instantiates fetch functions for int4/uint4.
238
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
239
  __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
240
            __ASM_OUT("r"), __asm_args)                                        \
241
  __IMPL_F1(float4, uint4, __args,                                             \
242
            __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"),       \
243
            __asm_args)
244
 
245
// Instantiates __tex_fetch_v4 with regular fetch functions.
246
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
247
  template <> struct __tex_fetch_v4<__op> {                                    \
248
    template <class T>                                                         \
249
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
250
    __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)            \
251
  }
252
 
253
// Same, but for sparse ops. Only available on sm_60+
254
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
255
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
256
                    __asm_args)                                                \
257
  template <> struct __tex_fetch_v4<__op> {                                    \
258
    template <class T>                                                         \
259
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
260
    __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
261
  }
262
#else
263
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
264
#endif
265
 
266
// Same, but for normalized float ops.
267
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
268
                    __asm_args)                                                \
269
  template <> struct __tex_fetch_v4<__op> {                                    \
270
    template <class T>                                                         \
271
    __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args));    \
272
    __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
273
  }
274
 
275
// Regular and normalized float ops share a lot of similarities.  This macro
276
// instantiates both variants -- normal for __op and normalized for __opn.
277
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,       \
278
                  __asm_args)                                                  \
279
  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args);      \
280
  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
281
 
282
// Convenience macros which converts string literal __op into a __Tag,
283
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)  \
284
  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
285
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
286
  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
287
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
288
  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
289
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,        \
290
                 __asm_args)                                                   \
291
  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
292
            __asm_args)
293
 
// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. So, we just inherit it, instead of redefining a copy.
#define __IMPL_ALIASI(__op, __opn)                                             \
  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))

// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
         "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");

__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
         (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
         ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");

__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
         (float __x, int __layer), "tex.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");

__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
         (float __x, int __layer, float __dPdx, float __dPdy),
         "tex.grad.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
         ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");

__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
         (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");

__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
         "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
         ("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");

// 2D
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");

__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
           " selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y)));

__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
         (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
         "tex.grad.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
         ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
          "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");

__IMPL_S3S("__itex2DGrad_sparse",
           (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
            "f"(__dPdy->y)));

__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
         (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");

__IMPL_S3S("__itex2DLayered_sparse",
           (float __x, float __y, int __layer, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y)));

__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
         (float __x, float __y, int __layer, const float2 *__dPdx,
          const float2 *__dPdy),
         "tex.grad.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");

__IMPL_S3S(
    "__itex2DLayeredGrad_sparse",
    (float __x, float __y, int __layer, const float2 *__dPdx,
     const float2 *__dPdy, unsigned char *__ir),
    "{.reg .pred %%p0;\n\t"
    "tex.grad.a2d.v4",
    "f32",
    "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
    "selp.u16 %4, 1, 0, %%p0; }",
    ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
     "f"(__dPdy->x), "f"(__dPdy->y)));

__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
         (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");

__IMPL_S3S("__itex2DLayeredLod_sparse",
           (float __x, float __y, int __layer, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));

__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
         (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");

__IMPL_S3S("__itex2DLod_sparse",
           (float __x, float __y, float __level, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__level)));

// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp.  We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr)                                        \
  __IMPL_SI(__IDV("__tex2Dgather_v2", variant),                                \
            __IDV("__tex2Dgather_rmnf_v2", variant),                           \
            (float __x, float __y, int __comp), instr, "f32",                  \
            "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));        \
  __IMPL_ALIASI(__IDV("__itex2Dgather", variant),                              \
                __IDV("__tex2Dgather_v2", variant));                           \
  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant),                         \
              (float __x, float __y, unsigned char *__ir, int __comp),         \
              "{.reg .pred %%p0;\n\t" instr, "f32",                            \
              "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"                     \
              "selp.u16 %4, 1, 0, %%p0; }",                                    \
              ("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");

// Umbrella dispatcher -- calls into the specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");

template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
  template <class __T>
  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
                                 float __y, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              unsigned char *__ir, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    }
  }
};
#endif

// 3D
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");

__IMPL_S3S("__itex3D_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.3d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");

__IMPL_S3S("__itex3DGrad_sparse",
           (float __x, float __y, float __z, const float4 *__dPdx,
            const float4 *__dPdy, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));

__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");

__IMPL_S3S("__itex3DLod_sparse",
           (float __x, float __y, float __z, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));

// Cubemap
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
         (float __x, float __y, float __z), "tex.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");

__IMPL_S3S("__itexCubemap_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.cube.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");

__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");

__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
         (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
         "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
          "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
          "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");

__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
         (float __x, float __y, float __z, int __layer, float __level),
         "tex.level.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");

__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");

635
// Helper class for extracting slice of data from V4 fetch results.
636
template <class __DestT, class __SrcT> struct __convert {
637
  template <int __NElements = sizeof(__DestT) /
638
                              sizeof(typename __TypeInfoT<__DestT>::__base_t)>
639
  __device__ static __DestT __run(__SrcT __v);
640
  template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
641
  template <> __device__ static __DestT __run<2>(__SrcT __v) {
642
    return {__v.x, __v.y};
643
  }
644
  template <> __device__ static __DestT __run<3>(__SrcT __v) {
645
    return {__v.x, __v.y, __v.z};
646
  }
647
  template <> __device__ static __DestT __run<4>(__SrcT __v) {
648
    return {__v.x, __v.y, __v.z, __v.w};
649
  }
650
};
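// E.g. __convert<float2, float4>::__run(__v) picks the __NElements == 2
// variant and returns {__v.x, __v.y}.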
651
 
652
// These are the top-level function overloads the __nv_tex_surf_handler expands
653
// to.  Each overload deals with one of the several ways __nv_tex_surf_handler
654
// is called by CUDA headers. In the end, each of the overloads does the same
655
// job -- it figures out which `__tex_fetch_v4::run` variant should be used to
656
// fetch texture data and which `__convert::run` is needed to convert it into
657
// appropriate return type.
658
 
659
// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
660
//   Data type and return type are based on ret.
661
template <class __op, class __T, class... __Args>
662
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
663
                                   __Args... __args) {
664
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
665
  *__ptr = __convert<__T, __FetchT>::__run(
666
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
667
}
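// Illustrative call site (hypothetical, assuming a texture object __obj):
//   float4 __ret;
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// fetches a float4 through this overload and stores it via __ret.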

#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference.  However,
// there's no way to convert them to cudaTextureObject_t at the C++ level. So,
// we cheat a bit and use inline assembly to do it. It costs us an extra
// register and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
  cudaTextureObject_t __obj;
  asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
  return __obj;
}

// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
//   Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, float4>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType the fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
#endif // CUDA_VERSION
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")
#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__