WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – //llvm-build/x86_64/lib/clang/16/include/__clang_cuda_texture_intrinsics.h

Rev	Author	Line No.	Line
/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 *
 * This header provides in-header implementations for NVCC's built-in
 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
 * built-in is unusual as it's actually a set of function overloads that use the
 * first string literal argument as one of the overload parameters.
 */
		14	#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
		15	#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
		16	#ifndef __CUDA__
		17	#error "This file is for CUDA compilation only."
		18	#endif
		19
		20	// __nv_tex_surf_handler() provided by this header as a macro.
		21	#define __nv_tex_surf_handler(__op, __ptr, ...) \
		22	::__cuda_tex::__tex_fetch< \
		23	::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
		24	__VA_ARGS__)
		25
		26	#pragma push_macro("__ASM_OUT")
		27	#pragma push_macro("__ASM_OUTP")
		28	#pragma push_macro("__Args")
		29	#pragma push_macro("__ID")
		30	#pragma push_macro("__IDV")
		31	#pragma push_macro("__IMPL_2DGATHER")
		32	#pragma push_macro("__IMPL_ALIAS")
		33	#pragma push_macro("__IMPL_ALIASI")
		34	#pragma push_macro("__IMPL_F1")
		35	#pragma push_macro("__IMPL_F3")
		36	#pragma push_macro("__IMPL_F3N")
		37	#pragma push_macro("__IMPL_F3S")
		38	#pragma push_macro("__IMPL_S")
		39	#pragma push_macro("__IMPL_S3")
		40	#pragma push_macro("__IMPL_S3I")
		41	#pragma push_macro("__IMPL_S3N")
		42	#pragma push_macro("__IMPL_S3NI")
		43	#pragma push_macro("__IMPL_S3S")
		44	#pragma push_macro("__IMPL_S3SI")
		45	#pragma push_macro("__IMPL_SI")
		46	#pragma push_macro("__L")
		47	#pragma push_macro("__STRIP_PARENS")
		48
		49	// Put all functions into anonymous namespace so they have internal linkage.
		50	// The device-only function here must be internal in order to avoid ODR
		51	// violations in case they are used from the files compiled with
		52	// -fgpu-rdc. E.g. a library and an app using it may be built with a different
		53	// version of this header file.
		54	namespace {
		55
		56	// Put the implmentation into its own namespace so we don't pollute the TU.
		57	namespace __cuda_tex {
		58
		59	// First, we need a perfect hash function and a few constexpr helper functions
		60	// for converting a string literal into a numeric value which can be used to
		61	// parametrize a template. We can not use string literals for that as that would
		62	// require C++20.
		63	//
		64	// The hash function was generated with 'gperf' and then manually converted into
		65	// its constexpr equivalent.
		66	//
		67	// NOTE: the perfect hashing scheme comes with inherent self-test. If the hash
		68	// function has a collision for any of the texture operations, the compilation
		69	// will fail due to an attempt to redefine a tag with the same value. If the
		70	// header compiles, then the hash function is good enough for the job.
		71
		72	constexpr int __tex_len(const char *s) {
		73	return (s[0] == 0) ? 0
		74	: (s[1] == 0) ? 1
		75	: (s[2] == 0) ? 2
		76	: (s[3] == 0) ? 3
		77	: (s[4] == 0) ? 4
		78	: (s[5] == 0) ? 5
		79	: (s[6] == 0) ? 6
		80	: (s[7] == 0) ? 7
		81	: (s[8] == 0) ? 8
		82	: (s[9] == 0) ? 9
		83	: (s[10] == 0) ? 10
		84	: (s[11] == 0) ? 11
		85	: (s[12] == 0) ? 12
		86	: (s[13] == 0) ? 13
		87	: (s[14] == 0) ? 14
		88	: (s[15] == 0) ? 15
		89	: (s[16] == 0) ? 16
		90	: (s[17] == 0) ? 17
		91	: (s[18] == 0) ? 18
		92	: (s[19] == 0) ? 19
		93	: (s[20] == 0) ? 20
		94	: (s[21] == 0) ? 21
		95	: (s[22] == 0) ? 22
		96	: (s[23] == 0) ? 23
		97	: (s[24] == 0) ? 24
		98	: (s[25] == 0) ? 25
		99	: (s[26] == 0) ? 26
		100	: (s[27] == 0) ? 27
		101	: (s[28] == 0) ? 28
		102	: (s[29] == 0) ? 29
		103	: (s[30] == 0) ? 30
		104	: (s[31] == 0) ? 31
		105	: 32;
		106	}
		107
		108	constexpr int __tex_hash_map(int c) {
		109	return (c == 49) ? 10
		110	: (c == 50) ? 0
		111	: (c == 51) ? 100
		112	: (c == 52) ? 30
		113	: (c == 67) ? 10
		114	: (c == 68) ? 0
		115	: (c == 69) ? 25
		116	: (c == 72) ? 70
		117	: (c == 77) ? 0
		118	: (c == 96) ? 44
		119	: (c == 99) ? 10
		120	: (c == 100) ? 5
		121	: (c == 101) ? 60
		122	: (c == 102) ? 40
		123	: (c == 103) ? 70
		124	: (c == 104) ? 25
		125	: (c == 112) ? 0
		126	: (c == 114) ? 45
		127	: (c == 117) ? 5
		128	: (c == 118) ? 85
		129	: (c == 120) ? 20
		130	: 225;
		131	}
		132
		133	constexpr int __tex_op_hash(const char *str) {
		134	return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
		135	__tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
		136	}
		137
		138	// Tag type to identify particular texture operation.
		139	template <int N> struct __Tag;
		140	#define __ID(__op) __Tag<__tex_op_hash(__op)>
		141	// Tags for variants of particular operation. E.g. tex2Dgather can translate
		142	// into 4 different instructions.
		143	#define __IDV(__op, __variant) \
		144	__Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
		145
		146	// Helper classes for figuring out key data types for derived types.
		147	// E.g. char2 has __base_t = char, __fetch_t = char4
		148	template <class> struct __TypeInfoT;
		149	// Type info for the fundamental types.
		150	template <> struct __TypeInfoT<float> {
		151	using __base_t = float;
		152	using __fetch_t = float4;
		153	};
		154	template <> struct __TypeInfoT<char> {
		155	using __base_t = char;
		156	using __fetch_t = int4;
		157	};
		158	template <> struct __TypeInfoT<signed char> {
		159	using __base_t = signed char;
		160	using __fetch_t = int4;
		161	};
		162	template <> struct __TypeInfoT<unsigned char> {
		163	using __base_t = unsigned char;
		164	using __fetch_t = uint4;
		165	};
		166	template <> struct __TypeInfoT<short> {
		167	using __base_t = short;
		168	using __fetch_t = int4;
		169	};
		170	template <> struct __TypeInfoT<unsigned short> {
		171	using __base_t = unsigned short;
		172	using __fetch_t = uint4;
		173	};
		174	template <> struct __TypeInfoT<int> {
		175	using __base_t = int;
		176	using __fetch_t = int4;
		177	};
		178	template <> struct __TypeInfoT<unsigned int> {
		179	using __base_t = unsigned int;
		180	using __fetch_t = uint4;
		181	};
		182
		183	// Derived base/fetch types for N-element vectors.
		184	template <class __T> struct __TypeInfoT {
		185	using __base_t = decltype(__T::x);
		186	using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
		187	};
		188
		189	// Classes that implement specific texture ops.
		190	template <class __op> struct __tex_fetch_v4;
		191
		192	// Helper macros to strip parens from a macro argument.
		193	#define __Args(...) __VA_ARGS__
		194	#define __STRIP_PARENS(__X) __X
		195	#define __L(__X) __STRIP_PARENS(__Args __X)
		196
		197	// Construct inline assembly output args.
		198	// Results are stored in a temp var __r.
		199	// isResident bool is pointed to by __ir
		200	// Asm args for return values. It's a 4-element vector
		201	#define __ASM_OUT(__t) \
		202	("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
		203	// .. possibly combined with a predicate.
		204	#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
		205
		206	// Implements a single variant of texture fetch instruction.
		207	#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
		208	template <> \
		209	__device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
		210	__rt __r; \
		211	asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
		212	return __r; \
		213	}
		214
		215	// Implements texture fetch instructions for int4/uint4/float4 data types.
		216	#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		217	__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
		218	__ASM_OUT("r"), __asm_args) \
		219	__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
		220	__ASM_OUT("r"), __asm_args) \
		221	__IMPL_F1(float4, float4, __args, \
		222	__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
		223	__asm_args)
		224	// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
		225	// types. Similar to above, but returns a boolean 'isPresent' value in addition
		226	// to texture data,
		227	#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		228	__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
		229	__ASM_OUTP("r"), __asm_args) \
		230	__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
		231	__ASM_OUTP("r"), __asm_args) \
		232	__IMPL_F1(float4, float4, __args, \
		233	__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
		234	__asm_args)
		235
		236	// Similar to F3, but for integer data which is returned as normalized floats.
		237	// Only instantiates fetch functions for int4/uint4.
		238	#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		239	__IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
		240	__ASM_OUT("r"), __asm_args) \
		241	__IMPL_F1(float4, uint4, __args, \
		242	__asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
		243	__asm_args)
		244
		245	// Instantiates __tex_fetch_v4 with regular fetch functions.
		246	#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		247	template <> struct __tex_fetch_v4<__op> { \
		248	template <class T> \
		249	__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
		250	__IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		251	}
		252
		253	// Same, but for sparse ops. Only available on sm_60+
		254	#if !defined(__CUDA_ARCH__) \|\| (__CUDA_ARCH__ >= 600)
		255	#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
		256	__asm_args) \
		257	template <> struct __tex_fetch_v4<__op> { \
		258	template <class T> \
		259	__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
		260	__IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		261	}
		262	#else
		263	#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
		264	#endif
		265
		266	// Same, but for normalized float ops.
		267	#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
		268	__asm_args) \
		269	template <> struct __tex_fetch_v4<__op> { \
		270	template <class T> \
		271	__device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
		272	__IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		273	}
		274
		275	// Regular and normalized float ops share a lot of similarities. This macro
		276	// instantiates both variants -- normal for __op and normalized for __opn.
		277	#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
		278	__asm_args) \
		279	__IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
		280	__IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
		281
		282	// Convenience macros which converts string literal __op into a __Tag,
		283	#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		284	__IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
		285	#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		286	__IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
		287	#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
		288	__IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
		289	#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
		290	__asm_args) \
		291	__IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
		292	__asm_args)
		293
		294	// CUDA headers have some 'legacy' texture oprerations that duplicate
		295	// functionality. So, we just inherit it, instead of refining a copy.
		296	#define __IMPL_ALIASI(__op, __opn) \
		297	template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
		298	#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
		299
		300	// Now we can instantiate everything we need for each specific texture fetch
		301	// variant.
		302	__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
		303	"{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
		304	__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
		305	"s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
		306	__IMPL_ALIAS("__itex1D", "__tex1D_v2");
		307	__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
		308
		309	__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
		310	(float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
		311	"{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
		312	("f"(__x), "f"(__dPdx), "f"(__dPdy)));
		313	__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
		314
		315	__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
		316	(float __x, int __layer), "tex.a1d.v4", "f32",
		317	"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
		318	__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
		319
		320	__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
		321	(float __x, int __layer, float __dPdx, float __dPdy),
		322	"tex.grad.a1d.v4", "f32",
		323	"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
		324	("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
		325	__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
		326
		327	__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
		328	(float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
		329	"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
		330	("r"(__layer), "f"(__x), "f"(__level)));
		331	__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
		332
		333	__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
		334	"tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
		335	("f"(__x), "f"(__level)));
		336	__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
		337
		338	// 2D
		339	__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
		340	"f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
		341	__IMPL_ALIAS("__itex2D", "__tex2D_v2");
		342
		343	__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
		344	"{.reg .pred %%p0;\n\t"
		345	"tex.2d.v4",
		346	"f32",
		347	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7}];\n\t"
		348	" selp.u16 %4, 1, 0, %%p0; }",
		349	("f"(__x), "f"(__y)));
		350
		351	__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
		352	(float __x, float __y, const float2 __dPdx, const float2 __dPdy),
		353	"tex.grad.2d.v4", "f32",
		354	"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
		355	("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
		356	"f"(__dPdy->y)));
		357	__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
		358
		359	__IMPL_S3S("__itex2DGrad_sparse",
		360	(float __x, float __y, const float2 __dPdx, const float2 __dPdy,
		361	unsigned char *__ir),
		362	"{.reg .pred %%p0;\n\t"
		363	"tex.grad.2d.v4",
		364	"f32",
		365	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
		366	"selp.u16 %4, 1, 0, %%p0; }",
		367	("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
		368	"f"(__dPdy->y)));
		369
		370	__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
		371	(float __x, float __y, int __layer), "tex.a2d.v4", "f32",
		372	"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
		373	("r"(__layer), "f"(__x), "f"(__y)));
		374	__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
		375
		376	__IMPL_S3S("__itex2DLayered_sparse",
		377	(float __x, float __y, int __layer, unsigned char *__ir),
		378	"{.reg .pred %%p0;\n\t"
		379	"tex.a2d.v4",
		380	"f32",
		381	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
		382	"selp.u16 %4, 1, 0, %%p0; }",
		383	("r"(__layer), "f"(__x), "f"(__y)));
		384
		385	__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
		386	(float __x, float __y, int __layer, const float2 *__dPdx,
		387	const float2 *__dPdy),
		388	"tex.grad.a2d.v4", "f32",
		389	"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
		390	("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
		391	"f"(__dPdy->x), "f"(__dPdy->y)));
		392	__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
		393
		394	__IMPL_S3S(
		395	"__itex2DLayeredGrad_sparse",
		396	(float __x, float __y, int __layer, const float2 *__dPdx,
		397	const float2 __dPdy, unsigned char __ir),
		398	"{.reg .pred %%p0;\n\t"
		399	"tex.grad.a2d.v4",
		400	"f32",
		401	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
		402	"selp.u16 %4, 1, 0, %%p0; }",
		403	("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
		404	"f"(__dPdy->x), "f"(__dPdy->y)));
		405
		406	__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
		407	(float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
		408	"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
		409	("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
		410	__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
		411
		412	__IMPL_S3S("__itex2DLayeredLod_sparse",
		413	(float __x, float __y, int __layer, float __level,
		414	unsigned char *__ir),
		415	"{.reg .pred %%p0;\n\t"
		416	"tex.level.a2d.v4",
		417	"f32",
		418	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
		419	"selp.u16 %4, 1, 0, %%p0; }",
		420	("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
		421
		422	__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
		423	(float __x, float __y, float __level), "tex.level.2d.v4", "f32",
		424	"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
		425	("f"(__x), "f"(__y), "f"(__level)));
		426	__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
		427
		428	__IMPL_S3S("__itex2DLod_sparse",
		429	(float __x, float __y, float __level, unsigned char *__ir),
		430	"{.reg .pred %%p0;\n\t"
		431	"tex.level.2d.v4",
		432	"f32",
		433	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7}], %8;\n\t"
		434	"selp.u16 %4, 1, 0, %%p0; }",
		435	("f"(__x), "f"(__y), "f"(__level)));
		436
		437	// 2D gather is special. Unlike other variants that translate into exactly one
		438	// asm instruction, it uses one of the four different instructions selected by
		439	// __comp. We implement each instruction variant separately, and dispatch the
		440	// right one from the manually implemented 'umbrella' fetch.
		441	#define __IMPL_2DGATHER(variant, instr) \
		442	__IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
		443	__IDV("__tex2Dgather_rmnf_v2", variant), \
		444	(float __x, float __y, int __comp), instr, "f32", \
		445	"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
		446	__IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
		447	__IDV("__tex2Dgather_v2", variant)); \
		448	__IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
		449	(float __x, float __y, unsigned char *__ir, int __comp), \
		450	"{.reg .pred %%p0;\n\t" instr, "f32", \
		451	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7}];\n\t" \
		452	"selp.u16 %4, 1, 0, %%p0; }", \
		453	("f"(__x), "f"(__y)));
		454	__IMPL_2DGATHER(0, "tld4.r.2d.v4");
		455	__IMPL_2DGATHER(1, "tld4.g.2d.v4");
		456	__IMPL_2DGATHER(2, "tld4.b.2d.v4");
		457	__IMPL_2DGATHER(3, "tld4.a.2d.v4");
		458
		459	// Umbrella dispatcher -- calls into specific 2Dgather variant.
		460	template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
		461	template <class __T>
		462	__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
		463	int __comp) {
		464	switch (__comp) {
		465	case 0:
		466	return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
		467	__obj, __x, __y, __comp);
		468	case 1:
		469	return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
		470	__obj, __x, __y, __comp);
		471	case 2:
		472	return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
		473	__obj, __x, __y, __comp);
		474	case 3:
		475	return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
		476	__obj, __x, __y, __comp);
		477	}
		478	}
		479	};
		480	__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
		481
		482	template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
		483	template <class __T>
		484	__device__ static float4 __run(cudaTextureObject_t __obj, float __x,
		485	float __y, int __comp) {
		486	switch (__comp) {
		487	case 0:
		488	return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
		489	__obj, __x, __y, __comp);
		490	case 1:
		491	return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
		492	__obj, __x, __y, __comp);
		493	case 2:
		494	return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
		495	__obj, __x, __y, __comp);
		496	case 3:
		497	return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
		498	__obj, __x, __y, __comp);
		499	}
		500	}
		501	};
		502
		503	#if !defined(__CUDA_ARCH__) \|\| (__CUDA_ARCH__ >= 600)
		504	template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
		505	template <class __T>
		506	__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
		507	unsigned char *__ir, int __comp) {
		508	switch (__comp) {
		509	case 0:
		510	return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
		511	__obj, __x, __y, __ir, __comp);
		512	case 1:
		513	return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
		514	__obj, __x, __y, __ir, __comp);
		515	case 2:
		516	return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
		517	__obj, __x, __y, __ir, __comp);
		518	case 3:
		519	return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
		520	__obj, __x, __y, __ir, __comp);
		521	}
		522	}
		523	};
		524	#endif
		525
		526	// 3D
		527	__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
		528	"tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
		529	("f"(__x), "f"(__y), "f"(__z)));
		530	__IMPL_ALIAS("__itex3D", "__tex3D_v2");
		531
		532	__IMPL_S3S("__itex3D_sparse",
		533	(float __x, float __y, float __z, unsigned char *__ir),
		534	"{.reg .pred %%p0;\n\t"
		535	"tex.3d.v4",
		536	"f32",
		537	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
		538	"selp.u16 %4, 1, 0, %%p0; }",
		539	("f"(__x), "f"(__y), "f"(__z)));
		540
		541	__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
		542	(float __x, float __y, float __z, const float4 *__dPdx,
		543	const float4 *__dPdy),
		544	"tex.grad.3d.v4", "f32",
		545	"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
		546	"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
		547	("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
		548	"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
		549	__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
		550
		551	__IMPL_S3S("__itex3DGrad_sparse",
		552	(float __x, float __y, float __z, const float4 *__dPdx,
		553	const float4 __dPdy, unsigned char __ir),
		554	"{.reg .pred %%p0;\n\t"
		555	"tex.grad.3d.v4",
		556	"f32",
		557	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7, %8, %8}], "
		558	"{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
		559	"selp.u16 %4, 1, 0, %%p0; }",
		560	("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
		561	"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
		562
		563	__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
		564	(float __x, float __y, float __z, float __level), "tex.level.3d.v4",
		565	"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
		566	("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
		567	__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
		568
		569	__IMPL_S3S("__itex3DLod_sparse",
		570	(float __x, float __y, float __z, float __level,
		571	unsigned char *__ir),
		572	"{.reg .pred %%p0;\n\t"
		573	"tex.level.3d.v4",
		574	"f32",
		575	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
		576	"selp.u16 %4, 1, 0, %%p0; }",
		577	("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
		578
		579	// Cubemap
		580	__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
		581	(float __x, float __y, float __z), "tex.cube.v4", "f32",
		582	"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
		583	("f"(__x), "f"(__y), "f"(__z)));
		584	__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
		585
		586	__IMPL_S3S("__itexCubemap_sparse",
		587	(float __x, float __y, float __z, unsigned char *__ir),
		588	"{.reg .pred %%p0;\n\t"
		589	"tex.cube.v4",
		590	"f32",
		591	"{%0, %1, %2, %3}\|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
		592	"selp.u16 %4, 1, 0, %%p0; }",
		593	("f"(__x), "f"(__y), "f"(__z)));
		594
		595	__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
		596	(float __x, float __y, float __z, const float4 *__dPdx,
		597	const float4 *__dPdy),
		598	"tex.grad.cube.v4", "f32",
		599	"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
		600	"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
		601	("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
		602	"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
		603	__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
		604
		605	__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
		606	(float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
		607	"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
		608	("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
		609	__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
		610
		611	__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
		612	(float __x, float __y, float __z, int __layer, const float4 *__dPdx,
		613	const float4 *__dPdy),
		614	"tex.grad.acube.v4", "f32",
		615	"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
		616	"{%9, %10, %11, %11}, {%12, %13, %14, %14};",
		617	("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
		618	"f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
		619	"f"(__dPdy->z)));
		620	__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
		621
		622	__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
		623	(float __x, float __y, float __z, int __layer, float __level),
		624	"tex.level.acube.v4", "f32",
		625	"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
		626	("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
		627	__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
		628
		629	__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
		630	(float __x, float __y, float __z, float __level), "tex.level.cube.v4",
		631	"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
		632	("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
		633	__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
		634
		635	// Helper class for extracting slice of data from V4 fetch results.
		636	template <class __DestT, class __SrcT> struct __convert {
		637	template <int __NElements = sizeof(__DestT) /
		638	sizeof(typename __TypeInfoT<__DestT>::__base_t)>
		639	__device__ static __DestT __run(__SrcT __v);
		640	template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
		641	template <> __device__ static __DestT __run<2>(__SrcT __v) {
		642	return {__v.x, __v.y};
		643	}
		644	template <> __device__ static __DestT __run<3>(__SrcT __v) {
		645	return {__v.x, __v.y, __v.z};
		646	}
		647	template <> __device__ static __DestT __run<4>(__SrcT __v) {
		648	return {__v.x, __v.y, __v.z, __v.w};
		649	}
		650	};
		651
		652	// These are the top-level function overloads the __nv_tex_surf_handler expands
		653	// to. Each overload deals with one of the several ways __nv_tex_surf_handler
		654	// is called by CUDA headers. In the end, each of the overloads does the same
		655	// job -- it figures out which `__tex_fetch_v4::run` variant should be used to
		656	// fetch texture data and which `__convert::run` is needed to convert it into
		657	// appropriate return type.
		658
		659	// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
		660	// Data type and return type are based on ret.
		661	template <class __op, class __T, class... __Args>
		662	__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
		663	__Args... __args) {
		664	using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
		665	*__ptr = __convert<__T, __FetchT>::__run(
		666	__tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
		667	}
		668
		669	#if CUDA_VERSION < 12000
		670	// texture<> objects get magically converted into a texture reference. However,
		671	// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
		672	// cheat a bit and use inline assembly to do it. It costs us an extra register
		673	// and a move, but that is easy for ptxas to optimize away.
		674	template <class __T>
		675	__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
		676	cudaTextureObject_t __obj;
		677	asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
		678	return __obj;
		679	}
		680
		681	// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
		682	// Data type and return type is based on ret.
		683	template <class __op, class __T, class __HandleT, class... __Args>
		684	__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
		685	__Args... __args) {
		686	using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
		687	*__ptr = __convert<__T, __FetchT>::__run(
		688	__tex_fetch_v4<__op>::template __run<__FetchT>(
		689	__tex_handle_to_obj(__handle), __args...));
		690	}
		691
		692	// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
		693	// cudaReadModeNormalizedFloat fetches always return float4.
		694	template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
		695	__device__ static void
		696	__tex_fetch(__DataT , __RetT __ptr,
		697	texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
		698	__Args... __args) {
		699	using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
		700	*__ptr = __convert<__RetT, float4>::__run(
		701	__tex_fetch_v4<__op>::template __run<__FetchT>(
		702	__tex_handle_to_obj(__handle), __args...));
		703	}
		704
		705	// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
		706	// For cudaReadModeElementType fetch return type is based on type_dummy.
		707	template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
		708	__device__ static void
		709	__tex_fetch(__DataT , __RetT __ptr,
		710	texture<__DataT, __TexT, cudaReadModeElementType> __handle,
		711	__Args... __args) {
		712	using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
		713	*__ptr = __convert<__RetT, __FetchT>::__run(
		714	__tex_fetch_v4<__op>::template __run<__FetchT>(
		715	__tex_handle_to_obj(__handle), __args...));
		716	}
		717	#endif // CUDA_VERSION
		718	} // namespace __cuda_tex
		719	} // namespace
		720	#pragma pop_macro("__ASM_OUT")
		721	#pragma pop_macro("__ASM_OUTP")
		722	#pragma pop_macro("__Args")
		723	#pragma pop_macro("__ID")
		724	#pragma pop_macro("__IDV")
		725	#pragma pop_macro("__IMPL_2DGATHER")
		726	#pragma pop_macro("__IMPL_ALIAS")
		727	#pragma pop_macro("__IMPL_ALIASI")
		728	#pragma pop_macro("__IMPL_F1")
		729	#pragma pop_macro("__IMPL_F3")
		730	#pragma pop_macro("__IMPL_F3N")
		731	#pragma pop_macro("__IMPL_F3S")
		732	#pragma pop_macro("__IMPL_S")
		733	#pragma pop_macro("__IMPL_S3")
		734	#pragma pop_macro("__IMPL_S3I")
		735	#pragma pop_macro("__IMPL_S3N")
		736	#pragma pop_macro("__IMPL_S3NI")
		737	#pragma pop_macro("__IMPL_S3S")
		738	#pragma pop_macro("__IMPL_S3SI")
		739	#pragma pop_macro("__IMPL_SI")
		740	#pragma pop_macro("__L")
		741	#pragma pop_macro("__STRIP_PARENS")
		742	#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite//llvm-build/x86_64/lib/clang/16/include/__clang_cuda_texture_intrinsics.h – Rev 14