/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 *
 * This header provides in-header implementations for NVCC's built-in
 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers.  The
 * built-in is unusual as it's actually a set of function overloads that use the
 * first string literal argument as one of the overload parameters.
 */
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// __nv_tex_surf_handler() is provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...)                                \
  ::__cuda_tex::__tex_fetch<                                                   \
      ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr,           \
                                                              __VA_ARGS__)

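// Illustrative sketch (variable names here are hypothetical): a typical call
// from CUDA's texture headers, such as
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// expands via the macro above into roughly
//   ::__cuda_tex::__tex_fetch<
//       ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex2D_v2")>>(
//       &__ret, __obj, __x, __y);
// i.e. the operation string becomes a tag type that selects an overload below.
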
#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")

// Put all functions into anonymous namespace so they have internal linkage.
// The device-only functions here must be internal in order to avoid ODR
// violations in case they are used from files compiled with
// -fgpu-rdc. E.g. a library and an app using it may be built with different
// versions of this header file.
namespace {

// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {

// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We cannot use string literals for that as that would
// require C++20.
//
// The hash function was generated with 'gperf' and then manually converted into
// its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the
// hash function has a collision for any of the texture operations, the
// compilation will fail due to an attempt to redefine a tag with the same
// value. If the header compiles, then the hash function is good enough for the
// job.
constexpr int __tex_len(const char *s) {
  return (s[0] == 0)    ? 0
         : (s[1] == 0)  ? 1
         : (s[2] == 0)  ? 2
         : (s[3] == 0)  ? 3
         : (s[4] == 0)  ? 4
         : (s[5] == 0)  ? 5
         : (s[6] == 0)  ? 6
         : (s[7] == 0)  ? 7
         : (s[8] == 0)  ? 8
         : (s[9] == 0)  ? 9
         : (s[10] == 0) ? 10
         : (s[11] == 0) ? 11
         : (s[12] == 0) ? 12
         : (s[13] == 0) ? 13
         : (s[14] == 0) ? 14
         : (s[15] == 0) ? 15
         : (s[16] == 0) ? 16
         : (s[17] == 0) ? 17
         : (s[18] == 0) ? 18
         : (s[19] == 0) ? 19
         : (s[20] == 0) ? 20
         : (s[21] == 0) ? 21
         : (s[22] == 0) ? 22
         : (s[23] == 0) ? 23
         : (s[24] == 0) ? 24
         : (s[25] == 0) ? 25
         : (s[26] == 0) ? 26
         : (s[27] == 0) ? 27
         : (s[28] == 0) ? 28
         : (s[29] == 0) ? 29
         : (s[30] == 0) ? 30
         : (s[31] == 0) ? 31
                        : 32;
}

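// For example, __tex_len("__tex2D_v2") evaluates to 10 -- the number of
// characters before the terminating NUL, capped at 32.
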
constexpr int __tex_hash_map(int c) {
  return (c == 49)    ? 10
         : (c == 50)  ? 0
         : (c == 51)  ? 100
         : (c == 52)  ? 30
         : (c == 67)  ? 10
         : (c == 68)  ? 0
         : (c == 69)  ? 25
         : (c == 72)  ? 70
         : (c == 77)  ? 0
         : (c == 96)  ? 44
         : (c == 99)  ? 10
         : (c == 100) ? 5
         : (c == 101) ? 60
         : (c == 102) ? 40
         : (c == 103) ? 70
         : (c == 104) ? 25
         : (c == 112) ? 0
         : (c == 114) ? 45
         : (c == 117) ? 5
         : (c == 118) ? 85
         : (c == 120) ? 20
                      : 225;
}

constexpr int __tex_op_hash(const char *str) {
  return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
         __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}

// Tag type to identify a particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of a particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant)                                                 \
  __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>

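// For example, __ID("__tex2D_v2") expands to __Tag<__tex_op_hash("__tex2D_v2")>
// and __IDV("__tex2Dgather_v2", 1) expands to
// __Tag<10000 + __tex_op_hash("__tex2Dgather_v2") * 100 + 1>, so every
// operation string (and every variant of it) gets its own distinct tag type.
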
// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = int4
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
  using __base_t = float;
  using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
  using __base_t = char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
  using __base_t = signed char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
  using __base_t = unsigned char;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
  using __base_t = short;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
  using __base_t = unsigned short;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
  using __base_t = int;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
  using __base_t = unsigned int;
  using __fetch_t = uint4;
};

// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
  using __base_t = decltype(__T::x);
  using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};

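// For example, __TypeInfoT<float2>::__base_t is float (the type of float2::x)
// and __TypeInfoT<float2>::__fetch_t is float4, so fetches are always done as
// 4-element vectors and narrowed to the requested type afterwards.
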
// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;

// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)

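// For example, __L((float __x, float __y)) expands to `float __x, float __y`,
// which lets a whole parameter list be passed around as a single,
// parenthesized macro argument.
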
// Construct inline assembly output args.
// Results are stored in a temp var __r; the 'isResident' bool is pointed to
// by __ir.
// Asm output constraints for the return value, which is a 4-element vector.
#define __ASM_OUT(__t)                                                         \
  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))

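// For example, __ASM_OUT("r") expands to
//   ("=r"(__r.x), "=r"(__r.y), "=r"(__r.z), "=r"(__r.w))
// and __ASM_OUTP("r") additionally appends "=h"(*__ir), so the instruction can
// also report whether the accessed memory was resident.
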
// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args)        \
  template <>                                                                  \
  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) {        \
    __rt __r;                                                                  \
    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args));             \
    return __r;                                                                \
  }

// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)        \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"),       \
            __asm_args)
// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to the above, but returns a boolean 'isPresent' value in
// addition to the texture data.
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"),      \
            __asm_args)

// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, uint4, __args,                                             \
            __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"),       \
            __asm_args)

// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
    __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)            \
  }

// Same, but for sparse ops. Only available on sm_60+.
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
    __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif

// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args));    \
    __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }

// Regular and normalized float ops share a lot of similarities.  This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,       \
                  __asm_args)                                                  \
  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args);      \
  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)

// Convenience macros which convert the string literal __op into a __Tag.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)  \
  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,        \
                 __asm_args)                                                   \
  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
            __asm_args)

// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. So, we just inherit it, instead of redefining a copy.
#define __IMPL_ALIASI(__op, __opn)                                             \
  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))

// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
         "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");

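// As a rough sketch (not literal preprocessor output), the __IMPL_S invocation
// for "__tex1D_v2" above produces a specialization along these lines, plus the
// matching ".s32."/".u32." fetch variants and the normalized-float
// "__tex1D_rmnf_v2" specialization:
//
//   template <> struct __tex_fetch_v4<__ID("__tex1D_v2")> {
//     template <class T>
//     __device__ static T __run(cudaTextureObject_t __obj, float __x);
//     template <>
//     __device__ float4 __run<float4>(cudaTextureObject_t __obj, float __x) {
//       float4 __r;
//       asm("tex.1d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
//           : "=f"(__r.x), "=f"(__r.y), "=f"(__r.z), "=f"(__r.w)
//           : "l"(__obj), "f"(__x));
//       return __r;
//     }
//   };
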
__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
         (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
         ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");

__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
         (float __x, int __layer), "tex.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");

__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
         (float __x, int __layer, float __dPdx, float __dPdy),
         "tex.grad.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
         ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");

__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
         (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");

__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
         "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
         ("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");

// 2D
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");

__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
           " selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y)));

__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
         (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
         "tex.grad.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
         ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
          "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");

__IMPL_S3S("__itex2DGrad_sparse",
           (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
            "f"(__dPdy->y)));

__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
         (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");

__IMPL_S3S("__itex2DLayered_sparse",
           (float __x, float __y, int __layer, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y)));

__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
         (float __x, float __y, int __layer, const float2 *__dPdx,
          const float2 *__dPdy),
         "tex.grad.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");

__IMPL_S3S(
    "__itex2DLayeredGrad_sparse",
    (float __x, float __y, int __layer, const float2 *__dPdx,
     const float2 *__dPdy, unsigned char *__ir),
    "{.reg .pred %%p0;\n\t"
    "tex.grad.a2d.v4",
    "f32",
    "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
    "selp.u16 %4, 1, 0, %%p0; }",
    ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
     "f"(__dPdy->x), "f"(__dPdy->y)));

__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
         (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");

__IMPL_S3S("__itex2DLayeredLod_sparse",
           (float __x, float __y, int __layer, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));

__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
         (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");

__IMPL_S3S("__itex2DLod_sparse",
           (float __x, float __y, float __level, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__level)));

// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp.  We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr)                                        \
  __IMPL_SI(__IDV("__tex2Dgather_v2", variant),                                \
            __IDV("__tex2Dgather_rmnf_v2", variant),                           \
            (float __x, float __y, int __comp), instr, "f32",                  \
            "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));        \
  __IMPL_ALIASI(__IDV("__itex2Dgather", variant),                              \
                __IDV("__tex2Dgather_v2", variant));                           \
  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant),                         \
              (float __x, float __y, unsigned char *__ir, int __comp),         \
              "{.reg .pred %%p0;\n\t" instr, "f32",                            \
              "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"                     \
              "selp.u16 %4, 1, 0, %%p0; }",                                    \
              ("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");

// Umbrella dispatcher -- calls into specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");

template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
  template <class __T>
  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
                                 float __y, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              unsigned char *__ir, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    }
  }
};
#endif

// 3D
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");

__IMPL_S3S("__itex3D_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.3d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");

__IMPL_S3S("__itex3DGrad_sparse",
           (float __x, float __y, float __z, const float4 *__dPdx,
            const float4 *__dPdy, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));

__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");

__IMPL_S3S("__itex3DLod_sparse",
           (float __x, float __y, float __z, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));

// Cubemap
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
         (float __x, float __y, float __z), "tex.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");

__IMPL_S3S("__itexCubemap_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.cube.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");

__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");

__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
         (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
         "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
          "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
          "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");

__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
         (float __x, float __y, float __z, int __layer, float __level),
         "tex.level.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");

__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");

// Helper class for extracting a slice of data from V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
  template <int __NElements = sizeof(__DestT) /
                              sizeof(typename __TypeInfoT<__DestT>::__base_t)>
  __device__ static __DestT __run(__SrcT __v);
  template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
  template <> __device__ static __DestT __run<2>(__SrcT __v) {
    return {__v.x, __v.y};
  }
  template <> __device__ static __DestT __run<3>(__SrcT __v) {
    return {__v.x, __v.y, __v.z};
  }
  template <> __device__ static __DestT __run<4>(__SrcT __v) {
    return {__v.x, __v.y, __v.z, __v.w};
  }
};

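// For example, __convert<float2, float4>::__run(__v) picks the
// __NElements == 2 specialization and returns {__v.x, __v.y}, keeping only as
// many elements of the 4-element fetch result as the destination type holds.
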
// These are the top-level function overloads the __nv_tex_surf_handler expands
// to.  Each overload deals with one of the several ways __nv_tex_surf_handler
// is called by CUDA headers. In the end, each of the overloads does the same
// job -- it figures out which `__tex_fetch_v4::__run` variant should be used to
// fetch texture data and which `__convert::__run` is needed to convert it into
// the appropriate return type.

// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
//   Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}

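// Illustrative sketch of how this overload is reached (variable names here are
// hypothetical): a call such as
//   float4 __ret;
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// resolves to this overload with __T = float4 (hence __FetchT = float4), calls
// __tex_fetch_v4<__ID("__tex2D_v2")>::__run<float4>(__obj, __x, __y), and
// stores the converted result through __ptr.
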
#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference.  However,
// there's no way to convert them to cudaTextureObject_t at the C++ level. So,
// we cheat a bit and use inline assembly to do it. It costs us an extra
// register and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
  cudaTextureObject_t __obj;
  asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
  return __obj;
}

// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
//   Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, float4>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType, the fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
#endif // CUDA_VERSION
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")
#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__