Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 14 | pmbaty | 1 | //===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===// |
| 2 | // |
||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
| 6 | // |
||
| 7 | //===----------------------------------------------------------------------===// |
||
| 8 | // |
||
| 9 | // This file defines all of the R600-specific intrinsics. |
||
| 10 | // |
||
| 11 | //===----------------------------------------------------------------------===// |
||
| 12 | |||
| 13 | class AMDGPUReadPreloadRegisterIntrinsic |
||
| 14 | : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 15 | |||
| 16 | class AMDGPUReadPreloadRegisterIntrinsicNamed<string name> |
||
| 17 | : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>; |
||
| 18 | |||
| 19 | // Used to tag image and resource intrinsics with information used to generate |
||
| 20 | // mem operands. |
||
| 21 | class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> { |
||
| 22 | int RsrcArg = rsrcarg; |
||
| 23 | bit IsImage = isimage; |
||
| 24 | } |
||
| 25 | |||
| 26 | let TargetPrefix = "r600" in { |
||
| 27 | |||
| 28 | multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz { |
||
| 29 | def _x : AMDGPUReadPreloadRegisterIntrinsic; |
||
| 30 | def _y : AMDGPUReadPreloadRegisterIntrinsic; |
||
| 31 | def _z : AMDGPUReadPreloadRegisterIntrinsic; |
||
| 32 | } |
||
| 33 | |||
| 34 | multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> { |
||
| 35 | def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>; |
||
| 36 | def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>; |
||
| 37 | def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>; |
||
| 38 | } |
||
| 39 | |||
| 40 | defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named |
||
| 41 | <"__builtin_r600_read_global_size">; |
||
| 42 | defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named |
||
| 43 | <"__builtin_r600_read_ngroups">; |
||
| 44 | defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named |
||
| 45 | <"__builtin_r600_read_tgid">; |
||
| 46 | |||
| 47 | defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; |
||
| 48 | defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz; |
||
| 49 | |||
| 50 | def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">, |
||
| 51 | Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>; |
||
| 52 | |||
| 53 | // AS 7 is PARAM_I_ADDRESS, used for kernel arguments |
||
| 54 | def int_r600_implicitarg_ptr : |
||
| 55 | ClangBuiltin<"__builtin_r600_implicitarg_ptr">, |
||
| 56 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [], |
||
| 57 | [IntrNoMem, IntrSpeculatable]>; |
||
| 58 | |||
| 59 | def int_r600_rat_store_typed : |
||
| 60 | // 1st parameter: Data |
||
| 61 | // 2nd parameter: Index |
||
| 62 | // 3rd parameter: Constant RAT ID |
||
| 63 | DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>, |
||
| 64 | ClangBuiltin<"__builtin_r600_rat_store_typed">; |
||
| 65 | |||
| 66 | def int_r600_recipsqrt_ieee : DefaultAttrsIntrinsic< |
||
| 67 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 68 | >; |
||
| 69 | |||
| 70 | def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic< |
||
| 71 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 72 | >; |
||
| 73 | |||
| 74 | def int_r600_cube : DefaultAttrsIntrinsic< |
||
| 75 | [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] |
||
| 76 | >; |
||
| 77 | |||
| 78 | def int_r600_store_stream_output : DefaultAttrsIntrinsic< |
||
| 79 | [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] |
||
| 80 | >; |
||
| 81 | |||
| 82 | class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [ |
||
| 83 | llvm_v4f32_ty, // Coord |
||
| 84 | llvm_i32_ty, // offset_x |
||
| 85 | llvm_i32_ty, // offset_y, |
||
| 86 | llvm_i32_ty, // offset_z, |
||
| 87 | llvm_i32_ty, // resource_id |
||
| 88 | llvm_i32_ty, // samplerid |
||
| 89 | llvm_i32_ty, // coord_type_x |
||
| 90 | llvm_i32_ty, // coord_type_y |
||
| 91 | llvm_i32_ty, // coord_type_z |
||
| 92 | llvm_i32_ty], // coord_type_w |
||
| 93 | [IntrNoMem] |
||
| 94 | >; |
||
| 95 | |||
| 96 | class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [ |
||
| 97 | llvm_v4i32_ty, // Coord |
||
| 98 | llvm_i32_ty, // offset_x |
||
| 99 | llvm_i32_ty, // offset_y, |
||
| 100 | llvm_i32_ty, // offset_z, |
||
| 101 | llvm_i32_ty, // resource_id |
||
| 102 | llvm_i32_ty, // samplerid |
||
| 103 | llvm_i32_ty, // coord_type_x |
||
| 104 | llvm_i32_ty, // coord_type_y |
||
| 105 | llvm_i32_ty, // coord_type_z |
||
| 106 | llvm_i32_ty], // coord_type_w |
||
| 107 | [IntrNoMem] |
||
| 108 | >; |
||
| 109 | |||
| 110 | def int_r600_store_swizzle : |
||
| 111 | Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 112 | >; |
||
| 113 | |||
| 114 | def int_r600_tex : TextureIntrinsicFloatInput; |
||
| 115 | def int_r600_texc : TextureIntrinsicFloatInput; |
||
| 116 | def int_r600_txl : TextureIntrinsicFloatInput; |
||
| 117 | def int_r600_txlc : TextureIntrinsicFloatInput; |
||
| 118 | def int_r600_txb : TextureIntrinsicFloatInput; |
||
| 119 | def int_r600_txbc : TextureIntrinsicFloatInput; |
||
| 120 | def int_r600_txf : TextureIntrinsicInt32Input; |
||
| 121 | def int_r600_txq : TextureIntrinsicInt32Input; |
||
| 122 | def int_r600_ddx : TextureIntrinsicFloatInput; |
||
| 123 | def int_r600_ddy : TextureIntrinsicFloatInput; |
||
| 124 | |||
| 125 | def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 126 | [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] |
||
| 127 | >; |
||
| 128 | |||
| 129 | def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>; |
||
| 130 | |||
| 131 | } // End TargetPrefix = "r600" |
||
| 132 | |||
| 133 | let TargetPrefix = "amdgcn" in { |
||
| 134 | |||
| 135 | //===----------------------------------------------------------------------===// |
||
| 136 | // ABI Special Intrinsics |
||
| 137 | //===----------------------------------------------------------------------===// |
||
| 138 | |||
| 139 | defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz; |
||
| 140 | defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named |
||
| 141 | <"__builtin_amdgcn_workgroup_id">; |
||
| 142 | |||
| 143 | def int_amdgcn_dispatch_ptr : |
||
| 144 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 145 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 146 | |||
| 147 | def int_amdgcn_queue_ptr : |
||
| 148 | ClangBuiltin<"__builtin_amdgcn_queue_ptr">, |
||
| 149 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 150 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 151 | |||
| 152 | def int_amdgcn_kernarg_segment_ptr : |
||
| 153 | ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, |
||
| 154 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 155 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 156 | |||
| 157 | def int_amdgcn_implicitarg_ptr : |
||
| 158 | ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">, |
||
| 159 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 160 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 161 | |||
| 162 | def int_amdgcn_groupstaticsize : |
||
| 163 | ClangBuiltin<"__builtin_amdgcn_groupstaticsize">, |
||
| 164 | DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 165 | |||
| 166 | def int_amdgcn_dispatch_id : |
||
| 167 | ClangBuiltin<"__builtin_amdgcn_dispatch_id">, |
||
| 168 | DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 169 | |||
| 170 | // For internal use. Coordinates LDS lowering between IR transform and backend. |
||
| 171 | def int_amdgcn_lds_kernel_id : |
||
| 172 | DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 173 | |||
| 174 | def int_amdgcn_implicit_buffer_ptr : |
||
| 175 | ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, |
||
| 176 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 177 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 178 | |||
| 179 | // Set EXEC to the 64-bit value given. |
||
| 180 | // This is always moved to the beginning of the basic block. |
||
| 181 | // FIXME: Should be mangled for wave size. |
||
| 182 | def int_amdgcn_init_exec : Intrinsic<[], |
||
| 183 | [llvm_i64_ty], // 64-bit literal constant |
||
| 184 | [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback, |
||
| 185 | IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>; |
||
| 186 | |||
| 187 | // Set EXEC according to a thread count packed in an SGPR input: |
||
| 188 | // thread_count = (input >> bitoffset) & 0x7f; |
||
| 189 | // This is always moved to the beginning of the basic block. |
||
| 190 | // Note: only inreg arguments to the parent function are valid as |
||
| 191 | // inputs to this intrinsic, computed values cannot be used. |
||
| 192 | def int_amdgcn_init_exec_from_input : Intrinsic<[], |
||
| 193 | [llvm_i32_ty, // 32-bit SGPR input |
||
| 194 | llvm_i32_ty], // bit offset of the thread count |
||
| 195 | [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback, |
||
| 196 | IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>; |
||
| 197 | |||
| 198 | def int_amdgcn_wavefrontsize : |
||
| 199 | ClangBuiltin<"__builtin_amdgcn_wavefrontsize">, |
||
| 200 | DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 201 | |||
| 202 | |||
| 203 | //===----------------------------------------------------------------------===// |
||
| 204 | // Instruction Intrinsics |
||
| 205 | //===----------------------------------------------------------------------===// |
||
| 206 | |||
| 207 | // The first parameter is s_sendmsg immediate (i16), |
||
| 208 | // the second one is copied to m0 |
||
| 209 | def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">, |
||
| 210 | Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], |
||
| 211 | [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; |
||
| 212 | def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, |
||
| 213 | Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], |
||
| 214 | [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; |
||
| 215 | |||
| 216 | |||
| 217 | // gfx11 intrinsic |
||
| 218 | // The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64. |
||
| 219 | def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], |
||
| 220 | [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; |
||
| 221 | |||
| 222 | def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">, |
||
| 223 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 224 | |||
| 225 | def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">, |
||
| 226 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 227 | |||
| 228 | // The 1st parameter is a mask for the types of instructions that may be allowed |
||
| 229 | // to cross the SCHED_BARRIER during scheduling. |
||
| 230 | // MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. |
||
| 231 | // MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be |
||
| 232 | // scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. |
||
| 233 | // MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER. |
||
| 234 | // MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER. |
||
| 235 | // MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER. |
||
| 236 | // MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER. |
||
| 237 | // MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER. |
||
| 238 | // MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER. |
||
| 239 | // MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER. |
||
| 240 | // MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER. |
||
| 241 | // MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER. |
||
| 242 | def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">, |
||
| 243 | Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, |
||
| 244 | IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 245 | |||
| 246 | // The first parameter is a mask that determines the types of instructions that |
||
| 247 | // you would like to synchronize around and add to a scheduling group. The |
||
| 248 | // values of the mask are defined above for sched_barrier. These instructions |
||
| 249 | // will be selected from the bottom up starting from the sched_group_barrier's |
||
| 250 | // location during instruction scheduling. The second parameter is the number of |
||
| 251 | // matching instructions that will be associated with this sched_group_barrier. |
||
| 252 | // The third parameter is an identifier which is used to describe what other |
||
| 253 | // sched_group_barriers should be synchronized with. |
||
| 254 | def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">, |
||
| 255 | Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 256 | [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects, |
||
| 257 | IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 258 | |||
| 259 | // Scheduler optimization hint. |
||
| 260 | // MASK = 0: Small gemm opt |
||
| 261 | def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, |
||
| 262 | Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, |
||
| 263 | IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 264 | |||
| 265 | def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">, |
||
| 266 | Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 267 | |||
| 268 | def int_amdgcn_div_scale : DefaultAttrsIntrinsic< |
||
| 269 | // 1st parameter: Numerator |
||
| 270 | // 2nd parameter: Denominator |
||
| 271 | // 3rd parameter: Select quotient. Must equal Numerator or Denominator. |
||
| 272 | // (0 = Denominator, 1 = Numerator). |
||
| 273 | [llvm_anyfloat_ty, llvm_i1_ty], |
||
| 274 | [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], |
||
| 275 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>] |
||
| 276 | >; |
||
| 277 | |||
| 278 | def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], |
||
| 279 | [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], |
||
| 280 | [IntrNoMem, IntrSpeculatable] |
||
| 281 | >; |
||
| 282 | |||
| 283 | def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], |
||
| 284 | [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], |
||
| 285 | [IntrNoMem, IntrSpeculatable] |
||
| 286 | >; |
||
| 287 | |||
| 288 | // Look Up 2.0 / pi src0 with segment select src1[4:0] |
||
| 289 | def int_amdgcn_trig_preop : DefaultAttrsIntrinsic< |
||
| 290 | [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], |
||
| 291 | [IntrNoMem, IntrSpeculatable] |
||
| 292 | >; |
||
| 293 | |||
| 294 | def int_amdgcn_sin : DefaultAttrsIntrinsic< |
||
| 295 | [llvm_anyfloat_ty], [LLVMMatchType<0>], |
||
| 296 | [IntrNoMem, IntrSpeculatable] |
||
| 297 | >; |
||
| 298 | |||
| 299 | def int_amdgcn_cos : DefaultAttrsIntrinsic< |
||
| 300 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 301 | >; |
||
| 302 | |||
| 303 | def int_amdgcn_log_clamp : DefaultAttrsIntrinsic< |
||
| 304 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 305 | >; |
||
| 306 | |||
| 307 | def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">, |
||
| 308 | DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], |
||
| 309 | [IntrNoMem, IntrSpeculatable, Commutative] |
||
| 310 | >; |
||
| 311 | |||
| 312 | // Fused single-precision multiply-add with legacy behaviour for the multiply, |
||
| 313 | // which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is |
||
| 314 | // intended for use on subtargets that have the v_fma_legacy_f32 and/or |
||
| 315 | // v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and |
||
| 316 | // has a completely different kind of legacy behaviour.) |
||
| 317 | def int_amdgcn_fma_legacy : |
||
| 318 | DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 319 | [IntrNoMem, IntrSpeculatable, Commutative] |
||
| 320 | >; |
||
| 321 | |||
| 322 | def int_amdgcn_rcp : DefaultAttrsIntrinsic< |
||
| 323 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 324 | >; |
||
| 325 | |||
| 326 | def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">, |
||
| 327 | DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], |
||
| 328 | [IntrNoMem, IntrSpeculatable] |
||
| 329 | >; |
||
| 330 | |||
| 331 | def int_amdgcn_sqrt : DefaultAttrsIntrinsic< |
||
| 332 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 333 | >; |
||
| 334 | |||
| 335 | def int_amdgcn_rsq : DefaultAttrsIntrinsic< |
||
| 336 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 337 | >; |
||
| 338 | |||
| 339 | def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">, |
||
| 340 | DefaultAttrsIntrinsic< |
||
| 341 | [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable] |
||
| 342 | >; |
||
| 343 | |||
| 344 | // out = 1.0 / sqrt(a) result clamped to +/- max_float. |
||
| 345 | def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic< |
||
| 346 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; |
||
| 347 | |||
| 348 | // For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will be used. |
||
| 349 | def int_amdgcn_ldexp : DefaultAttrsIntrinsic< |
||
| 350 | [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], |
||
| 351 | [IntrNoMem, IntrSpeculatable] |
||
| 352 | >; |
||
| 353 | |||
| 354 | def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic< |
||
| 355 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 356 | >; |
||
| 357 | |||
| 358 | def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic< |
||
| 359 | [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable] |
||
| 360 | >; |
||
| 361 | |||
| 362 | // v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0 |
||
| 363 | // and always uses rtz, so is not suitable for implementing the OpenCL |
||
| 364 | // fract function. It should be ok on VI. |
||
| 365 | def int_amdgcn_fract : DefaultAttrsIntrinsic< |
||
| 366 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 367 | >; |
||
| 368 | |||
| 369 | def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">, |
||
| 370 | DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], |
||
| 371 | [IntrNoMem, IntrSpeculatable] |
||
| 372 | >; |
||
| 373 | |||
| 374 | def int_amdgcn_cvt_pknorm_i16 : |
||
| 375 | ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">, |
||
| 376 | DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], |
||
| 377 | [IntrNoMem, IntrSpeculatable] |
||
| 378 | >; |
||
| 379 | |||
| 380 | def int_amdgcn_cvt_pknorm_u16 : |
||
| 381 | ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">, |
||
| 382 | DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], |
||
| 383 | [IntrNoMem, IntrSpeculatable] |
||
| 384 | >; |
||
| 385 | |||
| 386 | def int_amdgcn_cvt_pk_i16 : |
||
| 387 | ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">, |
||
| 388 | DefaultAttrsIntrinsic< |
||
| 389 | [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 390 | [IntrNoMem, IntrSpeculatable] |
||
| 391 | >; |
||
| 392 | |||
| 393 | def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">, |
||
| 394 | DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 395 | [IntrNoMem, IntrSpeculatable] |
||
| 396 | >; |
||
| 397 | |||
| 398 | def int_amdgcn_class : DefaultAttrsIntrinsic< |
||
| 399 | [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], |
||
| 400 | [IntrNoMem, IntrSpeculatable] |
||
| 401 | >; |
||
| 402 | |||
| 403 | def int_amdgcn_fmed3 : ClangBuiltin<"__builtin_amdgcn_fmed3">, |
||
| 404 | DefaultAttrsIntrinsic<[llvm_anyfloat_ty], |
||
| 405 | [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], |
||
| 406 | [IntrNoMem, IntrSpeculatable] |
||
| 407 | >; |
||
| 408 | |||
| 409 | def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">, |
||
| 410 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 411 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 412 | [IntrNoMem, IntrSpeculatable] |
||
| 413 | >; |
||
| 414 | |||
| 415 | def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">, |
||
| 416 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 417 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 418 | [IntrNoMem, IntrSpeculatable] |
||
| 419 | >; |
||
| 420 | |||
| 421 | def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">, |
||
| 422 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 423 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 424 | [IntrNoMem, IntrSpeculatable] |
||
| 425 | >; |
||
| 426 | |||
| 427 | def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">, |
||
| 428 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 429 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 430 | [IntrNoMem, IntrSpeculatable] |
||
| 431 | >; |
||
| 432 | |||
| 433 | // v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz |
||
| 434 | // should be used. |
||
| 435 | def int_amdgcn_sffbh : |
||
| 436 | DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], |
||
| 437 | [IntrNoMem, IntrSpeculatable] |
||
| 438 | >; |
||
| 439 | |||
| 440 | // v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support. |
||
| 441 | def int_amdgcn_fmad_ftz : |
||
| 442 | DefaultAttrsIntrinsic<[llvm_anyfloat_ty], |
||
| 443 | [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], |
||
| 444 | [IntrNoMem, IntrSpeculatable] |
||
| 445 | >; |
||
| 446 | |||
| 447 | // Fields should mirror atomicrmw |
||
| 448 | class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty], |
||
| 449 | [llvm_anyptr_ty, |
||
| 450 | LLVMMatchType<0>, |
||
| 451 | llvm_i32_ty, // ordering |
||
| 452 | llvm_i32_ty, // scope |
||
| 453 | llvm_i1_ty], // isVolatile |
||
| 454 | [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, |
||
| 455 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], "", |
||
| 456 | [SDNPMemOperand] |
||
| 457 | >; |
||
| 458 | |||
| 459 | def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin; |
||
| 460 | def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin; |
||
| 461 | |||
| 462 | class AMDGPULDSIntrin : |
||
| 463 | Intrinsic<[llvm_any_ty], |
||
| 464 | [LLVMQualPointerType<LLVMMatchType<0>, 3>, |
||
| 465 | LLVMMatchType<0>, |
||
| 466 | llvm_i32_ty, // ordering |
||
| 467 | llvm_i32_ty, // scope |
||
| 468 | llvm_i1_ty], // isVolatile |
||
| 469 | [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, |
||
| 470 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree] |
||
| 471 | >; |
||
| 472 | |||
| 473 | // FIXME: The m0 argument should be moved after the normal arguments |
||
| 474 | class AMDGPUDSOrderedIntrinsic : Intrinsic< |
||
| 475 | [llvm_i32_ty], |
||
| 476 | // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that |
||
| 477 | // the bit packing can be optimized at the IR level. |
||
| 478 | [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0) |
||
| 479 | llvm_i32_ty, // value to add or swap |
||
| 480 | llvm_i32_ty, // ordering |
||
| 481 | llvm_i32_ty, // scope |
||
| 482 | llvm_i1_ty, // isVolatile |
||
| 483 | llvm_i32_ty, // ordered count index (OA index), also added to the address |
||
| 484 | // gfx10: bits 24-27 indicate the number of active threads/dwords |
||
| 485 | llvm_i1_ty, // wave release, usually set to 1 |
||
| 486 | llvm_i1_ty], // wave done, set to 1 for the last ordered instruction |
||
| 487 | [IntrWillReturn, NoCapture<ArgIndex<0>>, |
||
| 488 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, |
||
| 489 | ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree |
||
| 490 | ] |
||
| 491 | >; |
||
| 492 | |||
| 493 | class AMDGPUDSAppendConsumedIntrinsic : Intrinsic< |
||
| 494 | [llvm_i32_ty], |
||
| 495 | [llvm_anyptr_ty, // LDS or GDS ptr |
||
| 496 | llvm_i1_ty], // isVolatile |
||
| 497 | [IntrConvergent, IntrWillReturn, IntrArgMemOnly, |
||
| 498 | NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree], |
||
| 499 | "", |
||
| 500 | [SDNPMemOperand] |
||
| 501 | >; |
||
| 502 | |||
| 503 | def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic; |
||
| 504 | def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; |
||
| 505 | |||
| 506 | // The pointer argument is assumed to be dynamically uniform if a VGPR. |
||
| 507 | def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; |
||
| 508 | def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; |
||
| 509 | |||
| 510 | def int_amdgcn_ds_fadd : AMDGPULDSIntrin; |
||
| 511 | def int_amdgcn_ds_fmin : AMDGPULDSIntrin; |
||
| 512 | def int_amdgcn_ds_fmax : AMDGPULDSIntrin; |
||
| 513 | |||
| 514 | } // TargetPrefix = "amdgcn" |
||
| 515 | |||
| 516 | // New-style image intrinsics |
||
| 517 | |||
| 518 | ////////////////////////////////////////////////////////////////////////// |
||
| 519 | // Dimension-aware image intrinsics framework |
||
| 520 | ////////////////////////////////////////////////////////////////////////// |
||
| 521 | |||
| 522 | // Helper class to represent (type, name) combinations of arguments. The |
||
| 523 | // argument names are explanatory and used as DAG operand names for codegen |
||
| 524 | // pattern matching. |
||
| 525 | class AMDGPUArg<LLVMType ty, string name> { |
||
| 526 | LLVMType Type = ty; |
||
| 527 | string Name = name; |
||
| 528 | } |
||
| 529 | |||
| 530 | // Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...] |
||
| 531 | class makeArgList<list<string> names, LLVMType basety> { |
||
| 532 | list<AMDGPUArg> ret = |
||
| 533 | !listconcat([AMDGPUArg<basety, names[0]>], |
||
| 534 | !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>)); |
||
| 535 | } |
||
| 536 | |||
| 537 | // Return arglist, with LLVMMatchType's references shifted by 'shift'. |
||
| 538 | class arglistmatchshift<list<AMDGPUArg> arglist, int shift> { |
||
| 539 | list<AMDGPUArg> ret = |
||
| 540 | !foreach(arg, arglist, |
||
| 541 | !if(!isa<LLVMMatchType>(arg.Type), |
||
| 542 | AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>, |
||
| 543 | arg.Name>, |
||
| 544 | arg)); |
||
| 545 | } |
||
| 546 | |||
| 547 | // Return the concatenation of the given arglists. LLVMMatchType's are adjusted |
||
| 548 | // accordingly, and shifted by an additional 'shift'. |
||
| 549 | class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> { |
||
| 550 | list<AMDGPUArg> ret = |
||
| 551 | !foldl([]<AMDGPUArg>, arglists, lhs, rhs, |
||
| 552 | !listconcat( |
||
| 553 | lhs, |
||
| 554 | arglistmatchshift<rhs, |
||
| 555 | !add(shift, !foldl(0, lhs, a, b, |
||
| 556 | !add(a, b.Type.isAny)))>.ret)); |
||
| 557 | } |
||
| 558 | |||
| 559 | // Represent texture/image types / dimensionality. |
||
| 560 | class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix, |
||
| 561 | list<string> coord_names, list<string> slice_names, |
||
| 562 | bit msaa = 0> { |
||
| 563 | AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME); |
||
| 564 | string Name = name; // e.g. "2darraymsaa" |
||
| 565 | string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings) |
||
| 566 | bits<3> Encoding = enc; |
||
| 567 | bit DA = 0; // DA bit in MIMG encoding |
||
| 568 | bit MSAA = msaa; |
||
| 569 | |||
| 570 | list<AMDGPUArg> CoordSliceArgs = |
||
| 571 | makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret; |
||
| 572 | list<AMDGPUArg> CoordSliceIntArgs = |
||
| 573 | makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret; |
||
| 574 | list<AMDGPUArg> GradientArgs = |
||
| 575 | makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"), |
||
| 576 | !foreach(name, coord_names, "d" # name # "dv")), |
||
| 577 | llvm_anyfloat_ty>.ret; |
||
| 578 | |||
| 579 | bits<8> NumCoords = !size(CoordSliceArgs); |
||
| 580 | bits<8> NumGradients = !size(GradientArgs); |
||
| 581 | } |
||
| 582 | |||
| 583 | def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>; |
||
| 584 | def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>; |
||
| 585 | def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>; |
||
| 586 | let DA = 1 in { |
||
| 587 | def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>; |
||
| 588 | def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>; |
||
| 589 | def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>; |
||
| 590 | } |
||
| 591 | def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>; |
||
| 592 | let DA = 1 in { |
||
| 593 | def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>; |
||
| 594 | } |
||
| 595 | |||
| 596 | def AMDGPUDims { |
||
| 597 | list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D, |
||
| 598 | AMDGPUDimCube, AMDGPUDim1DArray, |
||
| 599 | AMDGPUDim2DArray]; |
||
| 600 | list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa]; |
||
| 601 | list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa); |
||
| 602 | } |
||
| 603 | |||
| 604 | // Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof. |
||
| 605 | class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> { |
||
| 606 | string UpperCaseMod = ucmod; |
||
| 607 | string LowerCaseMod = lcmod; |
||
| 608 | |||
| 609 | // {offset} {bias} {z-compare} |
||
| 610 | list<AMDGPUArg> ExtraAddrArgs = extra_addr; |
||
| 611 | bit Offset = false; |
||
| 612 | bit Bias = false; |
||
| 613 | bit ZCompare = false; |
||
| 614 | bit Gradients = false; |
||
| 615 | |||
| 616 | // Name of the {lod} or {clamp} argument that is appended to the coordinates, |
||
| 617 | // if any. |
||
| 618 | string LodOrClamp = ""; |
||
| 619 | } |
||
| 620 | |||
| 621 | // AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE |
||
| 622 | // AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4 |
||
// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  // Defines <base> and <base>_o, where _o prepends an i32 "offset" to the
  // extra address arguments.
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  // Adds _c (z-compare) variants on top of the _o variants; the float
  // "zcompare" value is appended after the other extra address arguments.
  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  // Adds _cl (clamp) variants on top of the compare/offset variants.
  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  // Variants without explicit derivatives; per the comment above, these are
  // the only ones IMAGE_GATHER4 supports.
  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  // Derivative variants (_d, _cd) supply explicit gradients and are
  // intentionally outside the NoGradients defset above.
  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
||
| 662 | |||
| 663 | // Helper class to capture the profile of a dimension-aware image intrinsic. |
||
| 664 | // This information is used to generate the intrinsic's type and to inform |
||
| 665 | // codegen pattern matching. |
||
// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  // Number of overloaded ("any") types among the return and data arguments.
  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  // Full vaddr argument list: extra args (offset/bias/zcompare), then
  // gradients (if enabled), then coordinates/slice, then the trailing
  // lod/clamp/mip argument (if any). NOTE(review): arglistconcat is declared
  // earlier in this file; it presumably rebases LLVMMatchType indices by
  // NumRetAndDataAnyTypes — confirm against its definition.
  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  // AddrArgs with overloaded/match types resolved to the default width:
  // f32 for sample ops, i32 otherwise.
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  // Same, but for 16-bit (A16) addressing: f16 for sample ops, i16 otherwise.
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}
||
| 707 | |||
// Clones every overridable field of an existing profile, so a derived class
// can change individual fields without restating the whole profile.
class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}
||
| 720 | |||
// Profile for IMAGE_SAMPLE*: overloaded return type, with the extra address
// arguments and feature flags copied from the sample variant.
class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}
||
| 733 | |||
// Profile for non-sample ops (loads/stores): caller supplies return and data
// argument lists; Mip = true appends a "mip" level to the address arguments.
class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}
||
| 743 | |||
// Profile for integer image atomics: the overloaded integer return type also
// types the vdata operand(s) via LLVMMatchType in dataargs.
class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

// Float-typed atomics (fmin/fmax): identical except for the float overload.
class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}
||
| 757 | |||
// Profile for GET_RESINFO: overloaded float return; the address list is
// overridden to a single overloaded-int "mip" argument (bypassing the
// coordinate-based AddrArgs computed by the base class).
class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}
||
| 765 | |||
| 766 | // Helper class for figuring out image intrinsic argument indexes. |
||
// Helper class for figuring out image intrinsic argument indexes.
// Operand order (see AMDGPUImageDimIntrinsic below): data..., [dmask],
// vaddr... (offset, bias, zcompare, gradients, coords, lod/clamp/mip),
// rsrc, [samp, unorm], texfailctrl, cachepolicy.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  // Counts of each operand group.
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic);  // atomics carry no dmask operand
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);  // sampler descriptor + unorm bit
  // Cumulative operand indexes derived from the counts above.
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);  // last vaddr element
  int MipArgIndex = LodArgIndex;                            // mip occupies the same slot
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}
||
| 794 | |||
| 795 | // All dimension-aware intrinsics are derived from this class. |
||
// All dimension-aware intrinsics are derived from this class.
// Assembles the intrinsic signature from the profile and tags the immediate
// operands (dmask/unorm/texfailctrl/cachepolicy) using the indexes computed
// by AMDGPUImageDimIntrinsicEval.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : DefaultAttrsIntrinsic<
    P_.RetTypes,          // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),  // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),   // dmask(imm)
      P_.AddrTypes,                          // vaddr(VGPR)
      [llvm_v8i32_ty],                       // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,       // samp(SGPR)
                        llvm_i1_ty], []),    // unorm(imm)
      [llvm_i32_ty,                          // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                        // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)

     !listconcat(props,
          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),


      "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  // NOTE(review): self-reference via !cast(NAME); presumably lets generated
  // tables map a record back to the full intrinsic — confirm against users
  // of the Intr field elsewhere in the backend.
  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}
||
| 826 | |||
// Marker class for intrinsics with a DMask that determines the returned
// channels. Carries no members; mixed in alongside AMDGPUImageDimIntrinsic
// in the defs below.
class AMDGPUImageDMaskIntrinsic;
||
| 830 | |||
| 831 | defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = { |
||
| 832 | |||
| 833 | ////////////////////////////////////////////////////////////////////////// |
||
| 834 | // Load and store intrinsics |
||
| 835 | ////////////////////////////////////////////////////////////////////////// |
||
  // Instantiates NAME_<dim> for every non-MSAA dimension
  // (1d, 2d, 3d, cube, 1darray, 2darray).
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
          : AMDGPUImageDimIntrinsic<
              AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
              props, sdnodeprops>;
    }
  }
||
| 849 | |||
| 850 | multiclass AMDGPUImageDimIntrinsicsAll<string opmod, |
||
| 851 | list<LLVMType> retty, |
||
| 852 | list<AMDGPUArg> dataargs, |
||
| 853 | list<IntrinsicProperty> props, |
||
| 854 | list<SDNodeProperty> sdnodeprops, |
||
| 855 | bit Mip = false> { |
||
| 856 | foreach dim = AMDGPUDims.All in { |
||
| 857 | def !strconcat(NAME, "_", dim.Name) |
||
| 858 | : AMDGPUImageDimIntrinsic< |
||
| 859 | AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>, |
||
| 860 | props, sdnodeprops>; |
||
| 861 | } |
||
| 862 | } |
||
| 863 | |||
| 864 | defm int_amdgcn_image_load |
||
| 865 | : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem], |
||
| 866 | [SDNPMemOperand]>, |
||
| 867 | AMDGPUImageDMaskIntrinsic; |
||
| 868 | defm int_amdgcn_image_load_mip |
||
| 869 | : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [], |
||
| 870 | [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>, |
||
| 871 | AMDGPUImageDMaskIntrinsic; |
||
| 872 | |||
| 873 | defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll< |
||
| 874 | "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], |
||
| 875 | [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>; |
||
| 876 | defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa< |
||
| 877 | "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], |
||
| 878 | [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>; |
||
| 879 | |||
| 880 | ////////////////////////////////////////////////////////////////////////// |
||
| 881 | // MSAA intrinsics |
||
| 882 | ////////////////////////////////////////////////////////////////////////// |
||
  // MSAA_LOAD_X form, instantiated for the two MSAA dimensions only.
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  // Plain MSAA_LOAD form, same dimensions.
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }
||
| 896 | |||
| 897 | ////////////////////////////////////////////////////////////////////////// |
||
| 898 | // sample and getlod intrinsics |
||
| 899 | ////////////////////////////////////////////////////////////////////////// |
||
  // Instantiates a sample-style intrinsic per non-MSAA dimension.
  // NoMem = true makes the intrinsic pure (used by getlod below): IntrNoMem
  // and no memory operand instead of IntrReadMem + SDNPMemOperand.
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }
||
| 910 | |||
  // One IMAGE_SAMPLE* intrinsic family per sample variant (_c/_o/_b/_l/...).
  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
        : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
          AMDGPUImageDMaskIntrinsic;
  }

  // getlod reuses the basic sample profile but is pure (NoMem = 1 selects
  // IntrNoMem in AMDGPUImageDimSampleDims).
  defm int_amdgcn_image_getlod
      : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
        AMDGPUImageDMaskIntrinsic;
||
| 920 | |||
| 921 | ////////////////////////////////////////////////////////////////////////// |
||
| 922 | // getresinfo intrinsics |
||
| 923 | ////////////////////////////////////////////////////////////////////////// |
||
  // Resource-info query per dimension; pure (IntrNoMem) and therefore no
  // SDNPMemOperand.
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
        : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
          AMDGPUImageDMaskIntrinsic;
  }
||
| 929 | |||
| 930 | ////////////////////////////////////////////////////////////////////////// |
||
| 931 | // gather4 intrinsics |
||
| 932 | ////////////////////////////////////////////////////////////////////////// |
||
  // gather4 supports only the gradient-free sample variants and only the
  // 2d, cube and 2darray dimensions.
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
||
| 941 | } |
||
| 942 | |||
| 943 | ////////////////////////////////////////////////////////////////////////// |
||
| 944 | // atomic intrinsics |
||
| 945 | ////////////////////////////////////////////////////////////////////////// |
||
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  // One def per dimension; isFloat selects the float-overloaded profile.
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
          !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
               AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
          [], [SDNPMemOperand]>;
    }
  }

  // Common single-operand atomics: vdata matches the overloaded return type.
  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  // cmpswap takes two data operands (src, cmp) of the overloaded type.
  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}
||
| 986 | |||
| 987 | ////////////////////////////////////////////////////////////////////////// |
||
| 988 | // Buffer intrinsics |
||
| 989 | ////////////////////////////////////////////////////////////////////////// |
||
| 990 | |||
| 991 | let TargetPrefix = "amdgcn" in { |
||
| 992 | |||
| 993 | defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = { |
||
| 994 | |||
// Original-style buffer load (superseded by the raw/struct variants below;
// see the comment preceding AMDGPURawBufferLoad).
class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;
||
| 1006 | |||
| 1007 | // Generate a buffer_load instruction that may be optimized to s_buffer_load if |
||
| 1008 | // the offset argument is uniform. |
||
// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform. Modeled as IntrNoMem: the descriptor-based
// read is treated as invariant.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;
||
| 1016 | |||
// Original-style buffer store, counterpart of AMDGPUBufferLoad above.
class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;
||
| 1029 | |||
| 1030 | // New buffer intrinsics with separate raw and struct variants. The raw |
||
| 1031 | // variant never has an index. The struct variant always has an index, even if |
||
| 1032 | // it is const 0. A struct intrinsic with constant 0 index is different to the |
||
| 1033 | // corresponding raw intrinsic on gfx9+ because the behavior of bound checking |
||
| 1034 | // and swizzling changes depending on whether idxen is set in the instruction. |
||
| 1035 | // These new instrinsics also keep the offset and soffset arguments separate as |
||
| 1036 | // they behave differently in bounds checking and swizzling. |
||
// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different to the
// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new instrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
||
| 1050 | |||
// Struct variant: same as AMDGPURawBufferLoad plus a leading vindex operand.
class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
// NOTE(review): unlike int_amdgcn_raw_buffer_load_format above, the struct
// format load keeps the default llvm_any_ty overload — confirm intentional.
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
||
| 1065 | |||
// Raw-variant store, counterpart of AMDGPURawBufferLoad.
class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;
||
| 1080 | |||
// Struct-variant store: AMDGPURawBufferStore plus a vindex operand.
class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
||
| 1096 | |||
// Raw buffer atomics. NOTE(review): uses plain Intrinsic (not
// DefaultAttrsIntrinsic) and spells out willreturn/nocallback/nofree
// explicitly — presumably to avoid picking up other default attributes;
// confirm before converting. NoRtn = true drops the return value and types
// vdata directly with data_ty instead of LLVMMatchType.
class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
// cmpswap has an extra cmp operand, so it is spelled out rather than using
// the class above; cachepolicy moves to index 5 accordingly.
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
||
| 1133 | |||
// Struct buffer atomics: AMDGPURawBufferAtomic plus a vindex operand (see
// the raw class above for the NoRtn and attribute notes).
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
// cmpswap: extra cmp operand, cachepolicy at index 6.
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
||
| 1174 | |||
| 1175 | |||
| 1176 | // Obsolescent tbuffer intrinsics. |
||
// Typed-buffer load with split dfmt/nfmt fields (see "Obsolescent" note
// above; the raw/struct variants below merge them into one format operand).
def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],      // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // voffset(VGPR)
   llvm_i32_ty,       // soffset(SGPR)
   llvm_i32_ty,       // offset(imm)
   llvm_i32_ty,       // dfmt(imm)
   llvm_i32_ty,       // nfmt(imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
||
| 1192 | |||
// Typed-buffer store, counterpart of int_amdgcn_tbuffer_load above
// (immediate operand indexes shift by one for the leading vdata).
def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic <
  [],
  [llvm_any_ty,       // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // voffset(VGPR)
   llvm_i32_ty,       // soffset(SGPR)
   llvm_i32_ty,       // offset(imm)
   llvm_i32_ty,       // dfmt(imm)
   llvm_i32_ty,       // nfmt(imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
   ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
||
| 1209 | |||
| 1210 | // New tbuffer intrinsics, with: |
||
| 1211 | // - raw and struct variants |
||
| 1212 | // - joint format field |
||
| 1213 | // - joint cachepolicy field |
||
// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],      // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrReadMem,
   ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
||
| 1227 | |||
// Raw tbuffer store, counterpart of int_amdgcn_raw_tbuffer_load above.
def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
  [],
  [llvm_any_ty,       // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrWriteMem,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
||
| 1242 | |||
| 1243 | def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < |
||
| 1244 | [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 |
||
| 1245 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1246 | llvm_i32_ty, // vindex(VGPR) |
||
| 1247 | llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) |
||
| 1248 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
| 1249 | llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) |
||
| 1250 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
| 1251 | // bit 1 = slc, |
||
| 1252 | // bit 2 = dlc on gfx10+), |
||
| 1253 | // swizzled buffer (bit 3 = swz)) |
||
| 1254 | [IntrReadMem, |
||
| 1255 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, |
||
| 1256 | AMDGPURsrcIntrinsic<0>; |
||
| 1257 | |||
| 1258 | def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < |
||
| 1259 | [], |
||
| 1260 | [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 |
||
| 1261 | llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1262 | llvm_i32_ty, // vindex(VGPR) |
||
| 1263 | llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) |
||
| 1264 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
| 1265 | llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) |
||
| 1266 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
| 1267 | // bit 1 = slc, |
||
| 1268 | // bit 2 = dlc on gfx10+), |
||
| 1269 | // swizzled buffer (bit 3 = swz)) |
||
| 1270 | [IntrWriteMem, |
||
| 1271 | ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, |
||
| 1272 | AMDGPURsrcIntrinsic<1>; |
||
| 1273 | |||
| 1274 | class AMDGPUBufferAtomic : Intrinsic < |
||
| 1275 | [llvm_anyint_ty], |
||
| 1276 | [LLVMMatchType<0>, // vdata(VGPR) |
||
| 1277 | llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1278 | llvm_i32_ty, // vindex(VGPR) |
||
| 1279 | llvm_i32_ty, // offset(SGPR/VGPR/imm) |
||
| 1280 | llvm_i1_ty], // slc(imm) |
||
| 1281 | [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, |
||
| 1282 | AMDGPURsrcIntrinsic<1, 0>; |
||
| 1283 | def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; |
||
| 1284 | def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; |
||
| 1285 | def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic; |
||
| 1286 | def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic; |
||
| 1287 | def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic; |
||
| 1288 | def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic; |
||
| 1289 | def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic; |
||
| 1290 | def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic; |
||
| 1291 | def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic; |
||
| 1292 | def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic; |
||
| 1293 | def int_amdgcn_buffer_atomic_cmpswap : Intrinsic< |
||
| 1294 | [llvm_i32_ty], |
||
| 1295 | [llvm_i32_ty, // src(VGPR) |
||
| 1296 | llvm_i32_ty, // cmp(VGPR) |
||
| 1297 | llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1298 | llvm_i32_ty, // vindex(VGPR) |
||
| 1299 | llvm_i32_ty, // offset(SGPR/VGPR/imm) |
||
| 1300 | llvm_i1_ty], // slc(imm) |
||
| 1301 | [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, |
||
| 1302 | AMDGPURsrcIntrinsic<2, 0>; |
||
| 1303 | |||
| 1304 | def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic; |
||
| 1305 | |||
| 1306 | class AMDGPUBufferAtomicFP : Intrinsic < |
||
| 1307 | [llvm_anyfloat_ty], |
||
| 1308 | [LLVMMatchType<0>, // vdata(VGPR) |
||
| 1309 | llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1310 | llvm_i32_ty, // vindex(VGPR) |
||
| 1311 | llvm_i32_ty, // offset(SGPR/VGPR/imm) |
||
| 1312 | llvm_i1_ty], // slc(imm) |
||
| 1313 | [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, |
||
| 1314 | AMDGPURsrcIntrinsic<1, 0>; |
||
| 1315 | |||
| 1316 | // Legacy form of the intrinsic. The raw and struct forms should be preferred. |
||
| 1317 | def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP; |
||
| 1318 | |||
| 1319 | class AMDGPURawBufferLoadLDS : Intrinsic < |
||
| 1320 | [], |
||
| 1321 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1322 | LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset |
||
| 1323 | llvm_i32_ty, // Data byte size: 1/2/4 |
||
| 1324 | llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) |
||
| 1325 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
| 1326 | llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) |
||
| 1327 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
| 1328 | // bit 1 = slc, |
||
| 1329 | // bit 2 = dlc on gfx10+)) |
||
| 1330 | // swizzled buffer (bit 3 = swz)) |
||
| 1331 | [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, |
||
| 1332 | ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; |
||
| 1333 | def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; |
||
| 1334 | |||
| 1335 | class AMDGPUStructBufferLoadLDS : Intrinsic < |
||
| 1336 | [], |
||
| 1337 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1338 | LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset |
||
| 1339 | llvm_i32_ty, // Data byte size: 1/2/4 |
||
| 1340 | llvm_i32_ty, // vindex(VGPR) |
||
| 1341 | llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) |
||
| 1342 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
| 1343 | llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) |
||
| 1344 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
| 1345 | // bit 1 = slc, |
||
| 1346 | // bit 2 = dlc on gfx10+)) |
||
| 1347 | // swizzled buffer (bit 3 = swz)) |
||
| 1348 | [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, |
||
| 1349 | ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; |
||
| 1350 | def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; |
||
| 1351 | |||
| 1352 | } // defset AMDGPUBufferIntrinsics |
||
| 1353 | |||
| 1354 | // Uses that do not set the done bit should set IntrWriteMem on the |
||
| 1355 | // call site. |
||
| 1356 | def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [ |
||
| 1357 | llvm_i32_ty, // tgt, |
||
| 1358 | llvm_i32_ty, // en |
||
| 1359 | llvm_any_ty, // src0 (f32 or i32) |
||
| 1360 | LLVMMatchType<0>, // src1 |
||
| 1361 | LLVMMatchType<0>, // src2 |
||
| 1362 | LLVMMatchType<0>, // src3 |
||
| 1363 | llvm_i1_ty, // done |
||
| 1364 | llvm_i1_ty // vm (ignored on GFX11+) |
||
| 1365 | ], |
||
| 1366 | [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, |
||
| 1367 | ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly] |
||
| 1368 | >; |
||
| 1369 | |||
| 1370 | // exp with row_en bit set. Only supported on GFX11+. |
||
| 1371 | def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [ |
||
| 1372 | llvm_i32_ty, // tgt, |
||
| 1373 | llvm_i32_ty, // en |
||
| 1374 | llvm_any_ty, // src0 (f32 or i32) |
||
| 1375 | LLVMMatchType<0>, // src1 |
||
| 1376 | LLVMMatchType<0>, // src2 |
||
| 1377 | LLVMMatchType<0>, // src3 |
||
| 1378 | llvm_i1_ty, // done |
||
| 1379 | llvm_i32_ty], // row number |
||
| 1380 | [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, |
||
| 1381 | IntrWriteMem, IntrInaccessibleMemOnly] |
||
| 1382 | >; |
||
| 1383 | |||
| 1384 | // exp with compr bit set. Not supported on GFX11+. |
||
| 1385 | def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [ |
||
| 1386 | llvm_i32_ty, // tgt, |
||
| 1387 | llvm_i32_ty, // en |
||
| 1388 | llvm_anyvector_ty, // src0 (v2f16 or v2i16) |
||
| 1389 | LLVMMatchType<0>, // src1 |
||
| 1390 | llvm_i1_ty, // done |
||
| 1391 | llvm_i1_ty], // vm |
||
| 1392 | [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>, |
||
| 1393 | ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly] |
||
| 1394 | >; |
||
| 1395 | |||
| 1396 | def int_amdgcn_buffer_wbinvl1_sc : |
||
| 1397 | ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, |
||
| 1398 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1399 | |||
| 1400 | def int_amdgcn_buffer_wbinvl1 : |
||
| 1401 | ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">, |
||
| 1402 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1403 | |||
| 1404 | def int_amdgcn_s_dcache_inv : |
||
| 1405 | ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">, |
||
| 1406 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1407 | |||
| 1408 | def int_amdgcn_s_memtime : |
||
| 1409 | ClangBuiltin<"__builtin_amdgcn_s_memtime">, |
||
| 1410 | DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1411 | |||
| 1412 | def int_amdgcn_s_sleep : |
||
| 1413 | ClangBuiltin<"__builtin_amdgcn_s_sleep">, |
||
| 1414 | DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1415 | IntrHasSideEffects]> { |
||
| 1416 | } |
||
| 1417 | |||
| 1418 | def int_amdgcn_s_incperflevel : |
||
| 1419 | ClangBuiltin<"__builtin_amdgcn_s_incperflevel">, |
||
| 1420 | DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1421 | IntrHasSideEffects]> { |
||
| 1422 | } |
||
| 1423 | |||
| 1424 | def int_amdgcn_s_decperflevel : |
||
| 1425 | ClangBuiltin<"__builtin_amdgcn_s_decperflevel">, |
||
| 1426 | DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1427 | IntrHasSideEffects]> { |
||
| 1428 | } |
||
| 1429 | |||
| 1430 | def int_amdgcn_s_sethalt : |
||
| 1431 | DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1432 | IntrHasSideEffects]>; |
||
| 1433 | |||
| 1434 | def int_amdgcn_s_setprio : |
||
| 1435 | ClangBuiltin<"__builtin_amdgcn_s_setprio">, |
||
| 1436 | DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1437 | IntrHasSideEffects]>; |
||
| 1438 | |||
| 1439 | // This is IntrHasSideEffects so it can be used to read cycle counters. |
||
| 1440 | def int_amdgcn_s_getreg : |
||
| 1441 | ClangBuiltin<"__builtin_amdgcn_s_getreg">, |
||
| 1442 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], |
||
| 1443 | [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] |
||
| 1444 | >; |
||
| 1445 | |||
| 1446 | // Note this can be used to set FP environment properties that are |
||
| 1447 | // unsafe to change in non-strictfp functions. The register properties |
||
| 1448 | // available (and value required to access them) may differ per |
||
| 1449 | // subtarget. llvm.amdgcn.s.setreg(hwmode, value) |
||
| 1450 | def int_amdgcn_s_setreg : |
||
| 1451 | ClangBuiltin<"__builtin_amdgcn_s_setreg">, |
||
| 1452 | DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty], |
||
| 1453 | [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] |
||
| 1454 | >; |
||
| 1455 | |||
| 1456 | // int_amdgcn_s_getpc is provided to allow a specific style of position |
||
| 1457 | // independent code to determine the high part of its address when it is |
||
| 1458 | // known (through convention) that the code and any data of interest does |
||
| 1459 | // not cross a 4Gb address boundary. Use for any other purpose may not |
||
| 1460 | // produce the desired results as optimizations may cause code movement, |
||
| 1461 | // especially as we explicitly use IntrNoMem to allow optimizations. |
||
| 1462 | def int_amdgcn_s_getpc : |
||
| 1463 | ClangBuiltin<"__builtin_amdgcn_s_getpc">, |
||
| 1464 | DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, |
||
| 1465 | IntrWillReturn]>; |
||
| 1466 | |||
| 1467 | // __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0> |
||
| 1468 | // param values: 0 = P10, 1 = P20, 2 = P0 |
||
| 1469 | def int_amdgcn_interp_mov : |
||
| 1470 | ClangBuiltin<"__builtin_amdgcn_interp_mov">, |
||
| 1471 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1472 | [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1473 | [IntrNoMem, IntrSpeculatable, |
||
| 1474 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; |
||
| 1475 | |||
| 1476 | // __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0> |
||
| 1477 | // This intrinsic reads from lds, but the memory values are constant, |
||
| 1478 | // so it behaves like IntrNoMem. |
||
| 1479 | def int_amdgcn_interp_p1 : |
||
| 1480 | ClangBuiltin<"__builtin_amdgcn_interp_p1">, |
||
| 1481 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1482 | [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1483 | [IntrNoMem, IntrSpeculatable, |
||
| 1484 | ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; |
||
| 1485 | |||
| 1486 | // __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0> |
||
| 1487 | def int_amdgcn_interp_p2 : |
||
| 1488 | ClangBuiltin<"__builtin_amdgcn_interp_p2">, |
||
| 1489 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1490 | [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1491 | [IntrNoMem, IntrSpeculatable, |
||
| 1492 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; |
||
| 1493 | // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. |
||
| 1494 | |||
| 1495 | // __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0> |
||
| 1496 | // high selects whether high or low 16-bits are loaded from LDS |
||
| 1497 | def int_amdgcn_interp_p1_f16 : |
||
| 1498 | ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">, |
||
| 1499 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1500 | [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], |
||
| 1501 | [IntrNoMem, IntrSpeculatable, |
||
| 1502 | ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; |
||
| 1503 | |||
| 1504 | // __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0> |
||
| 1505 | // high selects whether high or low 16-bits are loaded from LDS |
||
| 1506 | def int_amdgcn_interp_p2_f16 : |
||
| 1507 | ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">, |
||
| 1508 | DefaultAttrsIntrinsic<[llvm_half_ty], |
||
| 1509 | [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], |
||
| 1510 | [IntrNoMem, IntrSpeculatable, |
||
| 1511 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>; |
||
| 1512 | |||
| 1513 | // llvm.amdgcn.lds.direct.load <m0> |
||
| 1514 | // The input argument is m0, which contains a packed combination of address |
||
| 1515 | // offset and flags describing the data type. |
||
| 1516 | def int_amdgcn_lds_direct_load : |
||
| 1517 | DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16 |
||
| 1518 | [llvm_i32_ty], |
||
| 1519 | [IntrReadMem, IntrSpeculatable]>; |
||
| 1520 | |||
| 1521 | // llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0> |
||
| 1522 | // Like interp intrinsics, this reads from lds, but the memory values are constant, |
||
| 1523 | // so it behaves like IntrNoMem. |
||
| 1524 | def int_amdgcn_lds_param_load : |
||
| 1525 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1526 | [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1527 | [IntrNoMem, IntrSpeculatable, |
||
| 1528 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; |
||
| 1529 | |||
| 1530 | // llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0> |
||
| 1531 | def int_amdgcn_interp_inreg_p10 : |
||
| 1532 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1533 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 1534 | [IntrNoMem, IntrSpeculatable]>; |
||
| 1535 | |||
| 1536 | // llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp> |
||
| 1537 | def int_amdgcn_interp_inreg_p2 : |
||
| 1538 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1539 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 1540 | [IntrNoMem, IntrSpeculatable]>; |
||
| 1541 | |||
| 1542 | // llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high> |
||
| 1543 | // high selects whether high or low 16-bits are used for p and p0 operands |
||
| 1544 | def int_amdgcn_interp_inreg_p10_f16: |
||
| 1545 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1546 | [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], |
||
| 1547 | [IntrNoMem, IntrSpeculatable, |
||
| 1548 | ImmArg<ArgIndex<3>>]>; |
||
| 1549 | |||
| 1550 | // llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high> |
||
| 1551 | // high selects whether high or low 16-bits are used for p operand |
||
| 1552 | def int_amdgcn_interp_inreg_p2_f16 : |
||
| 1553 | DefaultAttrsIntrinsic<[llvm_half_ty], |
||
| 1554 | [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], |
||
| 1555 | [IntrNoMem, IntrSpeculatable, |
||
| 1556 | ImmArg<ArgIndex<3>>]>; |
||
| 1557 | |||
| 1558 | // Deprecated: use llvm.amdgcn.live.mask instead. |
||
| 1559 | def int_amdgcn_ps_live : DefaultAttrsIntrinsic < |
||
| 1560 | [llvm_i1_ty], |
||
| 1561 | [], |
||
| 1562 | [IntrNoMem]>; |
||
| 1563 | |||
| 1564 | // Query currently live lanes. |
||
| 1565 | // Returns true if lane is live (and not a helper lane). |
||
| 1566 | def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty], |
||
| 1567 | [], [IntrReadMem, IntrInaccessibleMemOnly] |
||
| 1568 | >; |
||
| 1569 | |||
| 1570 | def int_amdgcn_mbcnt_lo : |
||
| 1571 | ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">, |
||
| 1572 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1573 | [IntrNoMem]>; |
||
| 1574 | |||
| 1575 | def int_amdgcn_mbcnt_hi : |
||
| 1576 | ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">, |
||
| 1577 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1578 | [IntrNoMem]>; |
||
| 1579 | |||
| 1580 | // llvm.amdgcn.ds.swizzle src offset |
||
| 1581 | def int_amdgcn_ds_swizzle : |
||
| 1582 | ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, |
||
| 1583 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1584 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, |
||
| 1585 | ImmArg<ArgIndex<1>>]>; |
||
| 1586 | |||
| 1587 | def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], |
||
| 1588 | [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], |
||
| 1589 | [IntrNoMem, IntrSpeculatable] |
||
| 1590 | >; |
||
| 1591 | |||
| 1592 | def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], |
||
| 1593 | [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], |
||
| 1594 | [IntrNoMem, IntrSpeculatable] |
||
| 1595 | >; |
||
| 1596 | |||
| 1597 | def int_amdgcn_lerp : |
||
| 1598 | ClangBuiltin<"__builtin_amdgcn_lerp">, |
||
| 1599 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1600 | [IntrNoMem, IntrSpeculatable] |
||
| 1601 | >; |
||
| 1602 | |||
| 1603 | def int_amdgcn_sad_u8 : |
||
| 1604 | ClangBuiltin<"__builtin_amdgcn_sad_u8">, |
||
| 1605 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1606 | [IntrNoMem, IntrSpeculatable] |
||
| 1607 | >; |
||
| 1608 | |||
| 1609 | def int_amdgcn_msad_u8 : |
||
| 1610 | ClangBuiltin<"__builtin_amdgcn_msad_u8">, |
||
| 1611 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1612 | [IntrNoMem, IntrSpeculatable] |
||
| 1613 | >; |
||
| 1614 | |||
| 1615 | def int_amdgcn_sad_hi_u8 : |
||
| 1616 | ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">, |
||
| 1617 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1618 | [IntrNoMem, IntrSpeculatable] |
||
| 1619 | >; |
||
| 1620 | |||
| 1621 | def int_amdgcn_sad_u16 : |
||
| 1622 | ClangBuiltin<"__builtin_amdgcn_sad_u16">, |
||
| 1623 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1624 | [IntrNoMem, IntrSpeculatable] |
||
| 1625 | >; |
||
| 1626 | |||
| 1627 | def int_amdgcn_qsad_pk_u16_u8 : |
||
| 1628 | ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, |
||
| 1629 | DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], |
||
| 1630 | [IntrNoMem, IntrSpeculatable] |
||
| 1631 | >; |
||
| 1632 | |||
| 1633 | def int_amdgcn_mqsad_pk_u16_u8 : |
||
| 1634 | ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, |
||
| 1635 | DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], |
||
| 1636 | [IntrNoMem, IntrSpeculatable] |
||
| 1637 | >; |
||
| 1638 | |||
| 1639 | def int_amdgcn_mqsad_u32_u8 : |
||
| 1640 | ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, |
||
| 1641 | DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], |
||
| 1642 | [IntrNoMem, IntrSpeculatable] |
||
| 1643 | >; |
||
| 1644 | |||
| 1645 | def int_amdgcn_cvt_pk_u8_f32 : |
||
| 1646 | ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, |
||
| 1647 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1648 | [IntrNoMem, IntrSpeculatable] |
||
| 1649 | >; |
||
| 1650 | |||
| 1651 | def int_amdgcn_icmp : |
||
| 1652 | Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty], |
||
| 1653 | [IntrNoMem, IntrConvergent, |
||
| 1654 | ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1655 | |||
| 1656 | def int_amdgcn_fcmp : |
||
| 1657 | Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty], |
||
| 1658 | [IntrNoMem, IntrConvergent, |
||
| 1659 | ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1660 | |||
| 1661 | def int_amdgcn_ballot : |
||
| 1662 | Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], |
||
| 1663 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1664 | |||
| 1665 | def int_amdgcn_readfirstlane : |
||
| 1666 | ClangBuiltin<"__builtin_amdgcn_readfirstlane">, |
||
| 1667 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty], |
||
| 1668 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1669 | |||
| 1670 | // The lane argument must be uniform across the currently active threads of the |
||
| 1671 | // current wave. Otherwise, the result is undefined. |
||
| 1672 | def int_amdgcn_readlane : |
||
| 1673 | ClangBuiltin<"__builtin_amdgcn_readlane">, |
||
| 1674 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1675 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1676 | |||
| 1677 | // The value to write and lane select arguments must be uniform across the |
||
| 1678 | // currently active threads of the current wave. Otherwise, the result is |
||
| 1679 | // undefined. |
||
| 1680 | def int_amdgcn_writelane : |
||
| 1681 | ClangBuiltin<"__builtin_amdgcn_writelane">, |
||
| 1682 | Intrinsic<[llvm_i32_ty], [ |
||
| 1683 | llvm_i32_ty, // uniform value to write: returned by the selected lane |
||
| 1684 | llvm_i32_ty, // uniform lane select |
||
| 1685 | llvm_i32_ty // returned by all lanes other than the selected one |
||
| 1686 | ], |
||
| 1687 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1688 | >; |
||
| 1689 | |||
| 1690 | def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, |
||
| 1691 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1692 | [IntrNoMem, IntrSpeculatable] |
||
| 1693 | >; |
||
| 1694 | |||
| 1695 | def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 1696 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1697 | [IntrNoMem, IntrSpeculatable] |
||
| 1698 | >; |
||
| 1699 | |||
| 1700 | def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 1701 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1702 | [IntrNoMem, IntrSpeculatable] |
||
| 1703 | >; |
||
| 1704 | |||
| 1705 | def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 1706 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1707 | [IntrNoMem, IntrSpeculatable] |
||
| 1708 | >; |
||
| 1709 | |||
| 1710 | def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 1711 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1712 | [IntrNoMem, IntrSpeculatable] |
||
| 1713 | >; |
||
| 1714 | |||
| 1715 | // llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) |
||
| 1716 | // |
||
| 1717 | // bar_val is the total number of waves that will wait on this |
||
| 1718 | // barrier, minus 1. |
||
| 1719 | def int_amdgcn_ds_gws_init : |
||
| 1720 | ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, |
||
| 1721 | Intrinsic<[], |
||
| 1722 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1723 | [IntrConvergent, IntrWriteMem, |
||
| 1724 | IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1725 | [SDNPMemOperand] |
||
| 1726 | >; |
||
| 1727 | |||
| 1728 | // llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) |
||
| 1729 | // bar_val is the total number of waves that will wait on this |
||
| 1730 | // barrier, minus 1. |
||
| 1731 | def int_amdgcn_ds_gws_barrier : |
||
| 1732 | ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, |
||
| 1733 | Intrinsic<[], |
||
| 1734 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1735 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1736 | [SDNPMemOperand] |
||
| 1737 | >; |
||
| 1738 | |||
| 1739 | // llvm.amdgcn.ds.gws.sema.v(i32 resource_id) |
||
| 1740 | def int_amdgcn_ds_gws_sema_v : |
||
| 1741 | ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, |
||
| 1742 | Intrinsic<[], |
||
| 1743 | [llvm_i32_ty], |
||
| 1744 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1745 | [SDNPMemOperand] |
||
| 1746 | >; |
||
| 1747 | |||
| 1748 | // llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) |
||
| 1749 | def int_amdgcn_ds_gws_sema_br : |
||
| 1750 | ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, |
||
| 1751 | Intrinsic<[], |
||
| 1752 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1753 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1754 | [SDNPMemOperand] |
||
| 1755 | >; |
||
| 1756 | |||
| 1757 | // llvm.amdgcn.ds.gws.sema.p(i32 resource_id) |
||
| 1758 | def int_amdgcn_ds_gws_sema_p : |
||
| 1759 | ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, |
||
| 1760 | Intrinsic<[], |
||
| 1761 | [llvm_i32_ty], |
||
| 1762 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1763 | [SDNPMemOperand] |
||
| 1764 | >; |
||
| 1765 | |||
| 1766 | // llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) |
||
| 1767 | def int_amdgcn_ds_gws_sema_release_all : |
||
| 1768 | ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, |
||
| 1769 | Intrinsic<[], |
||
| 1770 | [llvm_i32_ty], |
||
| 1771 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1772 | [SDNPMemOperand] |
||
| 1773 | >; |
||
| 1774 | |||
| 1775 | |||
| 1776 | // Copies the source value to the destination value, with the guarantee that |
||
| 1777 | // the source value is computed as if the entire program were executed in WQM. |
||
| 1778 | def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], |
||
| 1779 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1780 | >; |
||
| 1781 | |||
| 1782 | // Copies the source value to the destination value, such that the source |
||
| 1783 | // is computed as if the entire program were executed in WQM if any other |
||
| 1784 | // program code executes in WQM. |
||
| 1785 | def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], |
||
| 1786 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1787 | >; |
||
| 1788 | |||
| 1789 | // Return true if at least one thread within the pixel quad passes true into |
||
| 1790 | // the function. |
||
| 1791 | def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], |
||
| 1792 | [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1793 | >; |
||
| 1794 | |||
| 1795 | // If false, set EXEC=0 for the current thread until the end of program. |
||
| 1796 | // FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? |
||
| 1797 | def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>; |
||
| 1798 | |||
| 1799 | def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, |
||
| 1800 | Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrNoCallback, IntrNoFree] |
||
| 1801 | >; |
||
| 1802 | |||
| 1803 | // If false, mark all active lanes as helper lanes until the end of program. |
||
| 1804 | def int_amdgcn_wqm_demote : Intrinsic<[], |
||
| 1805 | [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree] |
||
| 1806 | >; |
||
| 1807 | |||
| 1808 | // Copies the active channels of the source value to the destination value, |
||
| 1809 | // with the guarantee that the source value is computed as if the entire |
||
| 1810 | // program were executed in Whole Wavefront Mode, i.e. with all channels |
||
| 1811 | // enabled, with a few exceptions; for example, Phi nodes which require WWM return an |
||
| 1812 | // undefined value. |
||
| 1813 | def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], |
||
| 1814 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, |
||
| 1815 | IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1816 | >; |
||
| 1817 | // Deprecated. Use int_amdgcn_strict_wwm instead. |
||
| 1818 | def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], |
||
| 1819 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, |
||
| 1820 | IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1821 | >; |
||
| 1822 | def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty], |
||
| 1823 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, |
||
| 1824 | IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1825 | >; |
||
| 1826 | |||
| 1827 | // Given a value, copies it while setting all the inactive lanes to a given |
||
| 1828 | // value. Note that OpenGL helper lanes are considered active, so if the |
||
| 1829 | // program ever uses WQM, then the instruction and the first source will be |
||
| 1830 | // computed in WQM. |
||
| 1831 | def int_amdgcn_set_inactive : |
||
| 1832 | Intrinsic<[llvm_anyint_ty], |
||
| 1833 | [LLVMMatchType<0>, // value to be copied |
||
| 1834 | LLVMMatchType<0>], // value for the inactive lanes to take |
||
| 1835 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1836 | |||
| 1837 | // Return if the given flat pointer points to a local memory address. |
||
| 1838 | def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, |
||
| 1839 | DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], |
||
| 1840 | [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] |
||
| 1841 | >; |
||
| 1842 | |||
| 1843 | // Return true if the given flat pointer points to a private memory address. |
||
| 1844 | def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">, |
||
| 1845 | DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], |
||
| 1846 | [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] |
||
| 1847 | >; |
||
| 1848 | |||
| 1849 | //===----------------------------------------------------------------------===// |
||
| 1850 | // CI+ Intrinsics |
||
| 1851 | //===----------------------------------------------------------------------===// |
||
| 1852 | |||
| 1853 | def int_amdgcn_s_dcache_inv_vol : |
||
| 1854 | ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, |
||
| 1855 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1856 | |||
| 1857 | def int_amdgcn_buffer_wbinvl1_vol : |
||
| 1858 | ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, |
||
| 1859 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1860 | |||
| 1861 | //===----------------------------------------------------------------------===// |
||
| 1862 | // VI Intrinsics |
||
| 1863 | //===----------------------------------------------------------------------===// |
||
| 1864 | |||
| 1865 | // llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> |
||
| 1866 | def int_amdgcn_mov_dpp : |
||
| 1867 | Intrinsic<[llvm_anyint_ty], |
||
| 1868 | [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, |
||
| 1869 | llvm_i1_ty], |
||
| 1870 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1871 | ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, |
||
| 1872 | ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; |
||
| 1873 | |||
| 1874 | // llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> |
||
| 1875 | // Should be equivalent to: |
||
| 1876 | // v_mov_b32 <dest> <old> |
||
| 1877 | // v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> |
||
| 1878 | def int_amdgcn_update_dpp : |
||
| 1879 | Intrinsic<[llvm_anyint_ty], |
||
| 1880 | [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, |
||
| 1881 | llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], |
||
| 1882 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1883 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, |
||
| 1884 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; |
||
| 1885 | |||
| 1886 | def int_amdgcn_s_dcache_wb : |
||
| 1887 | ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">, |
||
| 1888 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1889 | |||
| 1890 | def int_amdgcn_s_dcache_wb_vol : |
||
| 1891 | ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, |
||
| 1892 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1893 | |||
| 1894 | def int_amdgcn_s_memrealtime : |
||
| 1895 | ClangBuiltin<"__builtin_amdgcn_s_memrealtime">, |
||
| 1896 | Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1897 | |||
| 1898 | // llvm.amdgcn.ds.permute <index> <src> |
||
| 1899 | def int_amdgcn_ds_permute : |
||
| 1900 | ClangBuiltin<"__builtin_amdgcn_ds_permute">, |
||
| 1901 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1902 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1903 | |||
| 1904 | // llvm.amdgcn.ds.bpermute <index> <src> |
||
| 1905 | def int_amdgcn_ds_bpermute : |
||
| 1906 | ClangBuiltin<"__builtin_amdgcn_ds_bpermute">, |
||
| 1907 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1908 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1909 | |||
| 1910 | // llvm.amdgcn.perm <src0> <src1> <selector> |
||
| 1911 | def int_amdgcn_perm : |
||
| 1912 | ClangBuiltin<"__builtin_amdgcn_perm">, |
||
| 1913 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1914 | [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1915 | |||
| 1916 | //===----------------------------------------------------------------------===// |
||
| 1917 | // GFX9 Intrinsics |
||
| 1918 | //===----------------------------------------------------------------------===// |
||
| 1919 | |||
| 1920 | class AMDGPUGlobalLoadLDS : Intrinsic < |
||
| 1921 | [], |
||
| 1922 | [LLVMQualPointerType<llvm_i8_ty, 1>, // Base global pointer to load from |
||
| 1923 | LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base pointer to store to |
||
| 1924 | llvm_i32_ty, // Data byte size: 1/2/4 |
||
| 1925 | llvm_i32_ty, // imm offset (applied to both global and LDS address) |
||
| 1926 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, |
||
| 1927 | // bit 1 = slc/sc1, |
||
| 1928 | // bit 2 = dlc on gfx10+, |
||
| 1929 | // bit 4 = scc/nt on gfx90a+)) |
||
| 1930 | [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, |
||
| 1931 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], |
||
| 1932 | "", [SDNPMemOperand]>; |
||
| 1933 | def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; |
||
| 1934 | |||
| 1935 | //===----------------------------------------------------------------------===// |
||
| 1936 | // GFX10 Intrinsics |
||
| 1937 | //===----------------------------------------------------------------------===// |
||
| 1938 | |||
| 1939 | // llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control> |
||
| 1940 | def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, |
||
| 1941 | Intrinsic<[llvm_i32_ty], |
||
| 1942 | [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], |
||
| 1943 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1944 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; |
||
| 1945 | |||
| 1946 | // llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control> |
||
| 1947 | def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, |
||
| 1948 | Intrinsic<[llvm_i32_ty], |
||
| 1949 | [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], |
||
| 1950 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1951 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; |
||
| 1952 | |||
| 1953 | // llvm.amdgcn.mov.dpp8.i32 <src> <sel> |
||
| 1954 | // <sel> is a 32-bit constant whose high 8 bits must be zero which selects |
||
| 1955 | // the lanes to read from. |
||
| 1956 | def int_amdgcn_mov_dpp8 : |
||
| 1957 | Intrinsic<[llvm_anyint_ty], |
||
| 1958 | [LLVMMatchType<0>, llvm_i32_ty], |
||
| 1959 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1960 | ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>; |
||
| 1961 | |||
| 1962 | def int_amdgcn_s_get_waveid_in_workgroup : |
||
| 1963 | ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, |
||
| 1964 | Intrinsic<[llvm_i32_ty], [], |
||
| 1965 | [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1966 | |||
| 1967 | class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic < |
||
| 1968 | [vt], |
||
| 1969 | [llvm_anyptr_ty, // vaddr |
||
| 1970 | vt], // vdata(VGPR) |
||
| 1971 | [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", |
||
| 1972 | [SDNPMemOperand]>; |
||
| 1973 | |||
| 1974 | def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>; |
||
| 1975 | |||
| 1976 | // uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>, |
||
| 1977 | // <ray_dir>, <ray_inv_dir>, <texture_descr> |
||
| 1978 | // <node_ptr> is i32 or i64. |
||
| 1979 | // <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32. |
||
| 1980 | def int_amdgcn_image_bvh_intersect_ray : |
||
| 1981 | DefaultAttrsIntrinsic<[llvm_v4i32_ty], |
||
| 1982 | [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty, |
||
| 1983 | LLVMMatchType<1>, llvm_v4i32_ty], |
||
| 1984 | [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1985 | |||
| 1986 | //===----------------------------------------------------------------------===// |
||
| 1987 | // GFX11 Intrinsics |
||
| 1988 | //===----------------------------------------------------------------------===// |
||
| 1989 | |||
| 1990 | // llvm.amdgcn.permlane64 <src0> |
||
| 1991 | def int_amdgcn_permlane64 : |
||
| 1992 | ClangBuiltin<"__builtin_amdgcn_permlane64">, |
||
| 1993 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty], |
||
| 1994 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1995 | |||
| 1996 | def int_amdgcn_ds_add_gs_reg_rtn : |
||
| 1997 | ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, |
||
| 1998 | Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1999 | [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 2000 | |||
| 2001 | def int_amdgcn_ds_sub_gs_reg_rtn : |
||
| 2002 | ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, |
||
| 2003 | Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 2004 | [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 2005 | |||
| 2006 | def int_amdgcn_ds_bvh_stack_rtn : |
||
| 2007 | Intrinsic< |
||
| 2008 | [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr |
||
| 2009 | [ |
||
| 2010 | llvm_i32_ty, // %addr |
||
| 2011 | llvm_i32_ty, // %data0 |
||
| 2012 | llvm_v4i32_ty, // %data1 |
||
| 2013 | llvm_i32_ty, // %offset |
||
| 2014 | ], |
||
| 2015 | [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2016 | >; |
||
| 2017 | |||
| 2018 | // WMMA (Wave Matrix Multiply-Accumulate) intrinsics |
||
| 2019 | // |
||
| 2020 | // These operations perform a matrix multiplication and accumulation of |
||
| 2021 | // the form: D = A * B + C . |
||
| 2022 | |||
| 2023 | class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> : |
||
| 2024 | Intrinsic< |
||
| 2025 | [CD], // %D |
||
| 2026 | [ |
||
| 2027 | AB, // %A |
||
| 2028 | AB, // %B |
||
| 2029 | LLVMMatchType<0>, // %C |
||
| 2030 | ], |
||
| 2031 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2032 | >; |
||
| 2033 | |||
| 2034 | class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> : |
||
| 2035 | Intrinsic< |
||
| 2036 | [CD], // %D |
||
| 2037 | [ |
||
| 2038 | AB, // %A |
||
| 2039 | AB, // %B |
||
| 2040 | LLVMMatchType<0>, // %C |
||
| 2041 | llvm_i1_ty, // %high |
||
| 2042 | ], |
||
| 2043 | [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2044 | >; |
||
| 2045 | |||
| 2046 | class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> : |
||
| 2047 | Intrinsic< |
||
| 2048 | [CD], // %D |
||
| 2049 | [ |
||
| 2050 | llvm_i1_ty, // %A_sign |
||
| 2051 | AB, // %A |
||
| 2052 | llvm_i1_ty, // %B_sign |
||
| 2053 | AB, // %B |
||
| 2054 | LLVMMatchType<0>, // %C |
||
| 2055 | llvm_i1_ty, // %clamp |
||
| 2056 | ], |
||
| 2057 | [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2058 | >; |
||
| 2059 | |||
| 2060 | def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>; |
||
| 2061 | def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>; |
||
| 2062 | def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>; |
||
| 2063 | def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>; |
||
| 2064 | def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>; |
||
| 2065 | def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>; |
||
| 2066 | |||
| 2067 | def int_amdgcn_s_wait_event_export_ready : |
||
| 2068 | ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, |
||
| 2069 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] |
||
| 2070 | >; |
||
| 2071 | |||
| 2072 | //===----------------------------------------------------------------------===// |
||
| 2073 | // Deep learning intrinsics. |
||
| 2074 | //===----------------------------------------------------------------------===// |
||
| 2075 | |||
| 2076 | // f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) |
||
| 2077 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2078 | def int_amdgcn_fdot2 : |
||
| 2079 | ClangBuiltin<"__builtin_amdgcn_fdot2">, |
||
| 2080 | DefaultAttrsIntrinsic< |
||
| 2081 | [llvm_float_ty], // %r |
||
| 2082 | [ |
||
| 2083 | llvm_v2f16_ty, // %a |
||
| 2084 | llvm_v2f16_ty, // %b |
||
| 2085 | llvm_float_ty, // %c |
||
| 2086 | llvm_i1_ty // %clamp |
||
| 2087 | ], |
||
| 2088 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2089 | >; |
||
| 2090 | |||
| 2091 | // f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) |
||
| 2092 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2093 | def int_amdgcn_fdot2_f16_f16 : |
||
| 2094 | ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, |
||
| 2095 | DefaultAttrsIntrinsic< |
||
| 2096 | [llvm_half_ty], // %r |
||
| 2097 | [ |
||
| 2098 | llvm_v2f16_ty, // %a |
||
| 2099 | llvm_v2f16_ty, // %b |
||
| 2100 | llvm_half_ty // %c |
||
| 2101 | ], |
||
| 2102 | [IntrNoMem, IntrSpeculatable] |
||
| 2103 | >; |
||
| 2104 | |||
| 2105 | // bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) |
||
| 2106 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2107 | def int_amdgcn_fdot2_bf16_bf16 : |
||
| 2108 | ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, |
||
| 2109 | DefaultAttrsIntrinsic< |
||
| 2110 | [llvm_i16_ty], // %r |
||
| 2111 | [ |
||
| 2112 | llvm_v2i16_ty, // %a |
||
| 2113 | llvm_v2i16_ty, // %b |
||
| 2114 | llvm_i16_ty // %c |
||
| 2115 | ], |
||
| 2116 | [IntrNoMem, IntrSpeculatable] |
||
| 2117 | >; |
||
| 2118 | |||
| 2119 | // f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) |
||
| 2120 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2121 | def int_amdgcn_fdot2_f32_bf16 : |
||
| 2122 | ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, |
||
| 2123 | DefaultAttrsIntrinsic< |
||
| 2124 | [llvm_float_ty], // %r |
||
| 2125 | [ |
||
| 2126 | llvm_v2i16_ty, // %a |
||
| 2127 | llvm_v2i16_ty, // %b |
||
| 2128 | llvm_float_ty, // %c |
||
| 2129 | llvm_i1_ty // %clamp |
||
| 2130 | ], |
||
| 2131 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2132 | >; |
||
| 2133 | |||
| 2134 | // i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) |
||
| 2135 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2136 | def int_amdgcn_sdot2 : |
||
| 2137 | ClangBuiltin<"__builtin_amdgcn_sdot2">, |
||
| 2138 | DefaultAttrsIntrinsic< |
||
| 2139 | [llvm_i32_ty], // %r |
||
| 2140 | [ |
||
| 2141 | llvm_v2i16_ty, // %a |
||
| 2142 | llvm_v2i16_ty, // %b |
||
| 2143 | llvm_i32_ty, // %c |
||
| 2144 | llvm_i1_ty // %clamp |
||
| 2145 | ], |
||
| 2146 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2147 | >; |
||
| 2148 | |||
| 2149 | // u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) |
||
| 2150 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2151 | def int_amdgcn_udot2 : |
||
| 2152 | ClangBuiltin<"__builtin_amdgcn_udot2">, |
||
| 2153 | DefaultAttrsIntrinsic< |
||
| 2154 | [llvm_i32_ty], // %r |
||
| 2155 | [ |
||
| 2156 | llvm_v2i16_ty, // %a |
||
| 2157 | llvm_v2i16_ty, // %b |
||
| 2158 | llvm_i32_ty, // %c |
||
| 2159 | llvm_i1_ty // %clamp |
||
| 2160 | ], |
||
| 2161 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2162 | >; |
||
| 2163 | |||
| 2164 | // i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) |
||
| 2165 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c |
||
| 2166 | def int_amdgcn_sdot4 : |
||
| 2167 | ClangBuiltin<"__builtin_amdgcn_sdot4">, |
||
| 2168 | DefaultAttrsIntrinsic< |
||
| 2169 | [llvm_i32_ty], // %r |
||
| 2170 | [ |
||
| 2171 | llvm_i32_ty, // %a |
||
| 2172 | llvm_i32_ty, // %b |
||
| 2173 | llvm_i32_ty, // %c |
||
| 2174 | llvm_i1_ty // %clamp |
||
| 2175 | ], |
||
| 2176 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2177 | >; |
||
| 2178 | |||
| 2179 | // u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) |
||
| 2180 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c |
||
| 2181 | def int_amdgcn_udot4 : |
||
| 2182 | ClangBuiltin<"__builtin_amdgcn_udot4">, |
||
| 2183 | DefaultAttrsIntrinsic< |
||
| 2184 | [llvm_i32_ty], // %r |
||
| 2185 | [ |
||
| 2186 | llvm_i32_ty, // %a |
||
| 2187 | llvm_i32_ty, // %b |
||
| 2188 | llvm_i32_ty, // %c |
||
| 2189 | llvm_i1_ty // %clamp |
||
| 2190 | ], |
||
| 2191 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2192 | >; |
||
| 2193 | |||
| 2194 | // i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) |
||
| 2195 | // Treat input as signed (_sign = 1) or unsigned (_sign = 0). |
||
| 2196 | // a[i in 0...3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i])); |
||
| 2197 | // b[i in 0...3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); |
||
| 2198 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c |
||
| 2199 | def int_amdgcn_sudot4 : |
||
| 2200 | ClangBuiltin<"__builtin_amdgcn_sudot4">, |
||
| 2201 | DefaultAttrsIntrinsic< |
||
| 2202 | [llvm_i32_ty], // %r |
||
| 2203 | [ |
||
| 2204 | llvm_i1_ty, // %a_sign |
||
| 2205 | llvm_i32_ty, // %a |
||
| 2206 | llvm_i1_ty, // %b_sign |
||
| 2207 | llvm_i32_ty, // %b |
||
| 2208 | llvm_i32_ty, // %c |
||
| 2209 | llvm_i1_ty // %clamp |
||
| 2210 | ], |
||
| 2211 | [IntrNoMem, IntrSpeculatable, |
||
| 2212 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] |
||
| 2213 | >; |
||
| 2214 | |||
| 2215 | // i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) |
||
| 2216 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
| 2217 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
| 2218 | def int_amdgcn_sdot8 : |
||
| 2219 | ClangBuiltin<"__builtin_amdgcn_sdot8">, |
||
| 2220 | DefaultAttrsIntrinsic< |
||
| 2221 | [llvm_i32_ty], // %r |
||
| 2222 | [ |
||
| 2223 | llvm_i32_ty, // %a |
||
| 2224 | llvm_i32_ty, // %b |
||
| 2225 | llvm_i32_ty, // %c |
||
| 2226 | llvm_i1_ty // %clamp |
||
| 2227 | ], |
||
| 2228 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2229 | >; |
||
| 2230 | |||
| 2231 | // u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp) |
||
| 2232 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
| 2233 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
| 2234 | def int_amdgcn_udot8 : |
||
| 2235 | ClangBuiltin<"__builtin_amdgcn_udot8">, |
||
| 2236 | DefaultAttrsIntrinsic< |
||
| 2237 | [llvm_i32_ty], // %r |
||
| 2238 | [ |
||
| 2239 | llvm_i32_ty, // %a |
||
| 2240 | llvm_i32_ty, // %b |
||
| 2241 | llvm_i32_ty, // %c |
||
| 2242 | llvm_i1_ty // %clamp |
||
| 2243 | ], |
||
| 2244 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2245 | >; |
||
| 2246 | |||
| 2247 | // i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) |
||
| 2248 | // Treat input as signed (_sign = 1) or unsigned (_sign = 0). |
||
| 2249 | // a[i in 0...7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); |
||
| 2250 | // b[i in 0...7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i])); |
||
| 2251 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
| 2252 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
| 2253 | def int_amdgcn_sudot8 : |
||
| 2254 | ClangBuiltin<"__builtin_amdgcn_sudot8">, |
||
| 2255 | DefaultAttrsIntrinsic< |
||
| 2256 | [llvm_i32_ty], // %r |
||
| 2257 | [ |
||
| 2258 | llvm_i1_ty, // %a_sign |
||
| 2259 | llvm_i32_ty, // %a |
||
| 2260 | llvm_i1_ty, // %b_sign |
||
| 2261 | llvm_i32_ty, // %b |
||
| 2262 | llvm_i32_ty, // %c |
||
| 2263 | llvm_i1_ty // %clamp |
||
| 2264 | ], |
||
| 2265 | [IntrNoMem, IntrSpeculatable, |
||
| 2266 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] |
||
| 2267 | >; |
||
| 2268 | |||
| 2269 | //===----------------------------------------------------------------------===// |
||
| 2270 | // gfx908 intrinsics |
||
| 2271 | //===----------------------------------------------------------------------===// |
||
| 2272 | |||
| 2273 | def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2274 | |||
| 2275 | // llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp |
||
| 2276 | class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> : |
||
| 2277 | ClangBuiltin<!subst("int", "__builtin", NAME)>, |
||
| 2278 | DefaultAttrsIntrinsic<[DestTy], |
||
| 2279 | [SrcABTy, SrcABTy, DestTy, |
||
| 2280 | llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 2281 | [IntrConvergent, IntrNoMem, |
||
| 2282 | ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; |
||
| 2283 | |||
| 2284 | def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>; |
||
| 2285 | def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; |
||
| 2286 | def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; |
||
| 2287 | def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; |
||
| 2288 | def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; |
||
| 2289 | def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>; |
||
| 2290 | def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; |
||
| 2291 | def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; |
||
| 2292 | def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; |
||
| 2293 | def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; |
||
| 2294 | def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>; |
||
| 2295 | def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; |
||
| 2296 | def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; |
||
| 2297 | def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; |
||
| 2298 | def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; |
||
| 2299 | def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>; |
||
| 2300 | def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; |
||
| 2301 | def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; |
||
| 2302 | def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; |
||
| 2303 | def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; |
||
| 2304 | |||
| 2305 | //===----------------------------------------------------------------------===// |
||
| 2306 | // gfx90a intrinsics |
||
| 2307 | //===----------------------------------------------------------------------===// |
||
| 2308 | |||
| 2309 | def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2310 | def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2311 | def int_amdgcn_flat_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2312 | def int_amdgcn_flat_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2313 | def int_amdgcn_flat_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2314 | |||
| 2315 | def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>; |
||
| 2316 | def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; |
||
| 2317 | def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; |
||
| 2318 | def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; |
||
| 2319 | def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; |
||
| 2320 | |||
| 2321 | // Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. |
||
| 2322 | // Three bits corresponding to the neg modifier applied to the respective |
||
| 2323 | // source operand. |
||
| 2324 | def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>; |
||
| 2325 | def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>; |
||
| 2326 | |||
| 2327 | //===----------------------------------------------------------------------===// |
||
| 2328 | // gfx940 intrinsics |
||
| 2329 | //===----------------------------------------------------------------------===// |
||
| 2330 | |||
| 2331 | // bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. |
||
| 2332 | def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>; |
||
| 2333 | def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>; |
||
| 2334 | def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic< |
||
| 2335 | [llvm_v2i16_ty], |
||
| 2336 | [LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty], |
||
| 2337 | [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>, |
||
| 2338 | ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">; |
||
| 2339 | |||
| 2340 | def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>; |
||
| 2341 | def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>; |
||
| 2342 | def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>; |
||
| 2343 | def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>; |
||
| 2344 | |||
| 2345 | class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> : |
||
| 2346 | AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>; |
||
| 2347 | |||
| 2348 | multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> { |
||
| 2349 | foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in |
||
| 2350 | def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>; |
||
| 2351 | } |
||
| 2352 | |||
| 2353 | defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>; |
||
| 2354 | defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>; |
||
| 2355 | |||
| 2356 | // llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid |
||
| 2357 | class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> : |
||
| 2358 | ClangBuiltin<!subst("int", "__builtin", NAME)>, |
||
| 2359 | DefaultAttrsIntrinsic<[DestTy], |
||
| 2360 | [SrcA, SrcB, DestTy, llvm_i32_ty, |
||
| 2361 | llvm_i32_ty, llvm_i32_ty], |
||
| 2362 | [IntrConvergent, IntrNoMem, |
||
| 2363 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; |
||
| 2364 | |||
| 2365 | def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; |
||
| 2366 | def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; |
||
| 2367 | def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; |
||
| 2368 | def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; |
||
| 2369 | def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; |
||
| 2370 | def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; |
||
| 2371 | |||
| 2372 | class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> : |
||
| 2373 | AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>; |
||
| 2374 | |||
| 2375 | multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> { |
||
| 2376 | foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in |
||
| 2377 | def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>; |
||
| 2378 | } |
||
| 2379 | |||
| 2380 | defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>; |
||
| 2381 | defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>; |
||
| 2382 | |||
| 2383 | // llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3] |
||
| 2384 | // byte_sel selects byte from srcA. |
||
| 2385 | def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">, |
||
| 2386 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 2387 | [llvm_i32_ty, llvm_i32_ty], |
||
| 2388 | [IntrNoMem, ImmArg<ArgIndex<1>>]>; |
||
| 2389 | |||
| 2390 | // llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3] |
||
| 2391 | def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">, |
||
| 2392 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 2393 | [llvm_i32_ty, llvm_i32_ty], |
||
| 2394 | [IntrNoMem, ImmArg<ArgIndex<1>>]>; |
||
| 2395 | |||
| 2396 | // llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel |
||
| 2397 | // word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes. |
||
| 2398 | def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">, |
||
| 2399 | DefaultAttrsIntrinsic<[llvm_v2f32_ty], |
||
| 2400 | [llvm_i32_ty, llvm_i1_ty], |
||
| 2401 | [IntrNoMem, ImmArg<ArgIndex<1>>]>; |
||
| 2402 | |||
| 2403 | // llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel. |
||
| 2404 | def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">, |
||
| 2405 | DefaultAttrsIntrinsic<[llvm_v2f32_ty], |
||
| 2406 | [llvm_i32_ty, llvm_i1_ty], |
||
| 2407 | [IntrNoMem, ImmArg<ArgIndex<1>>]>; |
||
| 2408 | |||
| 2409 | // llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel |
||
| 2410 | // word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes. |
||
| 2411 | def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">, |
||
| 2412 | DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 2413 | [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], |
||
| 2414 | [IntrNoMem, ImmArg<ArgIndex<3>>]>; |
||
| 2415 | |||
| 2416 | // llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel |
||
| 2417 | def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">, |
||
| 2418 | DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 2419 | [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], |
||
| 2420 | [IntrNoMem, ImmArg<ArgIndex<3>>]>; |
||
| 2421 | |||
| 2422 | // llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] |
||
| 2423 | // byte_sel selects byte to write into vdst. |
||
| 2424 | def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">, |
||
| 2425 | DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 2426 | [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 2427 | [IntrNoMem, ImmArg<ArgIndex<3>>]>; |
||
| 2428 | |||
| 2429 | // llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] |
||
| 2430 | def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">, |
||
| 2431 | DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 2432 | [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 2433 | [IntrNoMem, ImmArg<ArgIndex<3>>]>; |
||
| 2434 | |||
| 2435 | // Represent a relocation constant. |
||
| 2436 | def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic< |
||
| 2437 | [llvm_i32_ty], [llvm_metadata_ty], |
||
| 2438 | [IntrNoMem, IntrSpeculatable] |
||
| 2439 | >; |
||
| 2440 | |||
| 2441 | //===----------------------------------------------------------------------===// |
||
| 2442 | // Special Intrinsics for backend internal use only. No frontend |
||
| 2443 | // should emit calls to these. |
||
| 2444 | //===----------------------------------------------------------------------===// |
||
| 2445 | def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], |
||
| 2446 | [llvm_i1_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2447 | >; |
||
| 2448 | |||
| 2449 | def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], |
||
| 2450 | [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2451 | >; |
||
| 2452 | |||
| 2453 | def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty], |
||
| 2454 | [llvm_i1_ty, LLVMMatchType<0>], |
||
| 2455 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2456 | >; |
||
| 2457 | |||
| 2458 | def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], |
||
| 2459 | [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2460 | >; |
||
| 2461 | |||
| 2462 | def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], |
||
| 2463 | [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 2464 | |||
| 2465 | // Represent unreachable in a divergent region. |
||
| 2466 | def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>; |
||
| 2467 | |||
| 2468 | // Emit 2.5 ulp, no denormal division. Should only be inserted by |
||
| 2469 | // pass based on !fpmath metadata. |
||
| 2470 | def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< |
||
| 2471 | [llvm_float_ty], [llvm_float_ty, llvm_float_ty], |
||
| 2472 | [IntrNoMem, IntrSpeculatable] |
||
| 2473 | >; |
||
| 2474 | } |