//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the AMDGPU-specific intrinsics.
//
//===----------------------------------------------------------------------===//

class AMDGPUReadPreloadRegisterIntrinsic
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg;
  bit IsImage = isimage;
}

let TargetPrefix = "r600" in {

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}
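
// Illustrative note (not from the original source): instantiating one of
// these multiclasses via `defm` appends the _x/_y/_z suffixes to the defm
// name, so the int_r600_read_tgid defm below produces int_r600_read_tgid_x,
// int_r600_read_tgid_y and int_r600_read_tgid_z, i.e. the IR-level
// intrinsics llvm.r600.read.tgid.x/.y/.z.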

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [],
  [IntrNoMem, IntrSpeculatable]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;

def int_r600_recipsqrt_ieee : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_cube : DefaultAttrsIntrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_store_stream_output : DefaultAttrsIntrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
>;

class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [
  llvm_v4i32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>;

} // End TargetPrefix = "r600"

let TargetPrefix = "amdgcn" in {

//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;

def int_amdgcn_dispatch_ptr :
  DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;

// For internal use. Coordinates LDS lowering between IR transform and backend.
def int_amdgcn_lds_kernel_id :
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic; computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;


//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is the s_sendmsg immediate (i16); the second one is
// copied to m0.
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
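
// Illustrative usage at the IR level (a sketch, not from the original file;
// the message code 3 below is a hypothetical immediate):
//   call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %m0val)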


// gfx11 intrinsic
// The first parameter is the s_sendmsg immediate (i16). Return type is i32 or i64.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
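
// Illustrative usage at the IR level (a sketch, not from the original file):
//   call void @llvm.amdgcn.sched.barrier(i32 0)  ; nothing may cross
//   call void @llvm.amdgcn.sched.barrier(i32 1)  ; ALU instructions may cross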

// The first parameter is a mask that determines the types of instructions that
// you would like to synchronize around and add to a scheduling group. The
// values of the mask are defined above for sched_barrier. These instructions
// will be selected from the bottom up starting from the sched_group_barrier's
// location during instruction scheduling. The second parameter is the number of
// matching instructions that will be associated with this sched_group_barrier.
// The third parameter is an identifier which is used to describe what other
// sched_group_barriers should be synchronized with.
def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
   IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
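
// Illustrative usage (a sketch, not from the original file): form a group of
// four MFMA/WMMA instructions (mask 0x8) under synchronization ID 0:
//   call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 4, i32 0)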

// Scheduler optimization hint.
//     MASK = 0: Small gemm opt
def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Select quotient. Must equal Numerator or Denominator.
  //                (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
>;
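
// Illustrative usage (a sketch, not from the original file): the intrinsic is
// overloaded on the float type and returns the scaled value plus a flag:
//   %r = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %num, float %den, i1 true)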

def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// Look Up 2.0 / pi src0 with segment select src1[4:0]
def int_amdgcn_trig_preop : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sin : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cos : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

def int_amdgcn_rcp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sqrt : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>;

// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;

// For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will be used.
def int_amdgcn_ldexp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
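
// Illustrative usage (a sketch, not from the original file):
//   %r = call float @llvm.amdgcn.ldexp.f32(float %x, i32 %exp) ; %x * 2^%exp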

def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
  DefaultAttrsIntrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_class : DefaultAttrsIntrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
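
// Illustrative usage (a sketch, not from the original file; the mask follows
// v_cmp_class, where bits 0 and 1 are assumed to select signaling/quiet NaN):
//   %is_nan = call i1 @llvm.amdgcn.class.f32(float %x, i32 3)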

def int_amdgcn_fmed3 : ClangBuiltin<"__builtin_amdgcn_fmed3">,
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable]
>;

// Fields should mirror atomicrmw
class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
  [llvm_anyptr_ty,
   LLVMMatchType<0>,
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty], // isVolatile
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
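
// Illustrative usage (a sketch, not from the original file; the overload
// suffix depends on the usual pointer/integer type mangling, and the second
// argument is assumed to be the wrap-around value):
//   %old = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %p,
//              i32 %wrap, i32 0, i32 0, i1 false)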

class AMDGPULDSIntrin :
  Intrinsic<[llvm_any_ty],
    [LLVMQualPointerType<LLVMMatchType<0>, 3>,
     LLVMMatchType<0>,
     llvm_i32_ty, // ordering
     llvm_i32_ty, // scope
     llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree
  ]
>;

class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty],    // isVolatile
  [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
   NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
  "",
  [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////

// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}
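
// Worked example (not from the original file):
//   makeArgList<["s", "t"], llvm_anyfloat_ty>.ret
// evaluates to
//   [AMDGPUArg<llvm_anyfloat_ty, "s">, AMDGPUArg<LLVMMatchType<0>, "t">]
// so only the first coordinate introduces a fresh overloaded type.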

// Return arglist, with LLVMMatchType's references shifted by 'shift'.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
               !add(shift, !foldl(0, lhs, a, b,
                                  !add(a, b.Type.isAny)))>.ret));
}

// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name; // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0; // DA bit in MIMG encoding
  bit MSAA = msaa;

  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}

def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}

// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
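
// Worked example (not from the original file): the helpers compose, so the
// plain defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []> alone yields
// the eight variants AMDGPUSample, AMDGPUSample_o, AMDGPUSample_c,
// AMDGPUSample_c_o, AMDGPUSample_cl, AMDGPUSample_cl_o, AMDGPUSample_c_cl
// and AMDGPUSample_c_cl_o.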

// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}

class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}

class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}
765 | |||
766 | // Helper class for figuring out image intrinsic argument indexes. |
||
767 | class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> { |
||
768 | int NumDataArgs = !size(P_.DataArgs); |
||
769 | int NumDmaskArgs = !not(P_.IsAtomic); |
||
770 | int NumOffsetArgs = !if(P_.Offset, 1, 0); |
||
771 | int NumBiasArgs = !if(P_.Bias, 1, 0); |
||
772 | int NumZCompareArgs = !if(P_.ZCompare, 1, 0); |
||
773 | int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs); |
||
774 | int NumVAddrArgs = !size(P_.AddrArgs); |
||
775 | int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0); |
||
776 | int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs)); |
||
777 | int NumRSrcArgs = 1; |
||
778 | int NumSampArgs = !if(P_.IsSample, 2, 0); |
||
779 | int DmaskArgIndex = NumDataArgs; |
||
780 | int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs); |
||
781 | int OffsetArgIndex = VAddrArgIndex; |
||
782 | int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs); |
||
783 | int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs); |
||
784 | int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs); |
||
785 | int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs); |
||
786 | int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1); |
||
787 | int MipArgIndex = LodArgIndex; |
||
788 | int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs); |
||
789 | int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs); |
||
790 | int UnormArgIndex = !add(SampArgIndex, 1); |
||
791 | int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs); |
||
792 | int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1); |
||
793 | } |
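
// Worked example (not from the original file): for a plain 2D load profile
// (no data args, not atomic, not a sample) the evaluation gives
//   DmaskArgIndex = 0, VAddrArgIndex = 1, NumVAddrArgs = 2 (s, t),
//   RsrcArgIndex = 3, TexFailCtrlArgIndex = 4, CachePolicyArgIndex = 5,
// matching the (dmask, s, t, rsrc, texfailctrl, cachepolicy) argument order
// built by AMDGPUImageDimIntrinsic below.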

// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : DefaultAttrsIntrinsic<
    P_.RetTypes,                            // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),  // dmask(imm)
      P_.AddrTypes,                         // vaddr(VGPR)
      [llvm_v8i32_ty],                      // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,      // samp(SGPR)
                        llvm_i1_ty], []),   // unorm(imm)
      [llvm_i32_ty,                         // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                       // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)

    !listconcat(props,
      !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
      !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
      [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
       ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),


    "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;

defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
    "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
    [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
    "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
    [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }
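
  // Illustrative usage at the IR level (a sketch, not from the original file):
  //   %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(
  //            i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp,
  //            i1 false, i32 0, i32 0)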

  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}

//////////////////////////////////////////////////////////////////////////
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
          !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
               AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
          [], [SDNPMemOperand]>;
    }
  }

  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}

//////////////////////////////////////////////////////////////////////////
// Buffer intrinsics
//////////////////////////////////////////////////////////////////////////

let TargetPrefix = "amdgcn" in {

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;
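
// Illustrative usage (a sketch, not from the original file):
//   %c = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %byteoff, i32 0)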

class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;

// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different to the
// corresponding raw intrinsic on gfx9+ because the behavior of bounds checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
1050 | |||
1051 | class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < |
||
1052 | [data_ty], |
||
1053 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
1054 | llvm_i32_ty, // vindex(VGPR) |
||
1055 | llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) |
||
1056 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
1057 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
1058 | // bit 1 = slc, |
||
1059 | // bit 2 = dlc on gfx10+), |
||
1060 | // swizzled buffer (bit 3 = swz)) |
||
1061 | [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, |
||
1062 | AMDGPURsrcIntrinsic<0>; |
||
1063 | def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; |
||
1064 | def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad; |
||
1065 | |||
1066 | class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < |
||
1067 | [], |
||
1068 | [data_ty, // vdata(VGPR) |
||
1069 | llvm_v4i32_ty, // rsrc(SGPR) |
||
1070 | llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) |
||
1071 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
1072 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
1073 | // bit 1 = slc, |
||
1074 | // bit 2 = dlc on gfx10+), |
||
1075 | // swizzled buffer (bit 3 = swz)) |
||
1076 | [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, |
||
1077 | AMDGPURsrcIntrinsic<1>; |
||
1078 | def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>; |
||
1079 | def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore; |
||
1080 | |||
1081 | class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < |
||
1082 | [], |
||
1083 | [data_ty, // vdata(VGPR) |
||
1084 | llvm_v4i32_ty, // rsrc(SGPR) |
||
1085 | llvm_i32_ty, // vindex(VGPR) |
||
1086 | llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) |
||
1087 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
1088 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
1089 | // bit 1 = slc, |
||
1090 | // bit 2 = dlc on gfx10+), |
||
1091 | // swizzled buffer (bit 3 = swz)) |
||
1092 | [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, |
||
1093 | AMDGPURsrcIntrinsic<1>; |
||
1094 | def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; |
||
1095 | def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore; |
||

class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
||
1167 | |||
1168 | // gfx908 intrinsic |
||
1169 | def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; |
||
1170 | |||
1171 | // gfx90a intrinsics |
||
1172 | def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; |
||
1173 | def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; |
||
1174 | |||
1175 | |||
1176 | // Obsolescent tbuffer intrinsics. |
||
1177 | def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic < |
||
1178 | [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 |
||
1179 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
1180 | llvm_i32_ty, // vindex(VGPR) |
||
1181 | llvm_i32_ty, // voffset(VGPR) |
||
1182 | llvm_i32_ty, // soffset(SGPR) |
||
1183 | llvm_i32_ty, // offset(imm) |
||
1184 | llvm_i32_ty, // dfmt(imm) |
||
1185 | llvm_i32_ty, // nfmt(imm) |
||
1186 | llvm_i1_ty, // glc(imm) |
||
1187 | llvm_i1_ty], // slc(imm) |
||
1188 | [IntrReadMem, |
||
1189 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, |
||
1190 | ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>, |
||
1191 | AMDGPURsrcIntrinsic<0>; |
||
1192 | |||
1193 | def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic < |
||
1194 | [], |
||
1195 | [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 |
||
1196 | llvm_v4i32_ty, // rsrc(SGPR) |
||
1197 | llvm_i32_ty, // vindex(VGPR) |
||
1198 | llvm_i32_ty, // voffset(VGPR) |
||
1199 | llvm_i32_ty, // soffset(SGPR) |
||
1200 | llvm_i32_ty, // offset(imm) |
||
1201 | llvm_i32_ty, // dfmt(imm) |
||
1202 | llvm_i32_ty, // nfmt(imm) |
||
1203 | llvm_i1_ty, // glc(imm) |
||
1204 | llvm_i1_ty], // slc(imm) |
||
1205 | [IntrWriteMem, ImmArg<ArgIndex<5>>, |
||
1206 | ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, |
||
1207 | ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>, |
||
1208 | AMDGPURsrcIntrinsic<1>; |
||
1209 | |||
1210 | // New tbuffer intrinsics, with: |
||
1211 | // - raw and struct variants |
||
1212 | // - joint format field |
||
1213 | // - joint cachepolicy field |
||
1214 | def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < |
||
1215 | [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 |
||
1216 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
1217 | llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) |
||
1218 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
1219 | llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) |
||
1220 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
1221 | // bit 1 = slc, |
||
1222 | // bit 2 = dlc on gfx10+), |
||
1223 | // swizzled buffer (bit 3 = swz)) |
||
1224 | [IntrReadMem, |
||
1225 | ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, |
||
1226 | AMDGPURsrcIntrinsic<0>; |
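// For illustration only: the format immediate packs dfmt and nfmt as
// described above, i.e. format = (nfmt << 4) | dfmt. So, with hypothetical
// values dfmt = 4 and nfmt = 7, the encoding would be (7 << 4) | 4 = 0x74.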

def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
  [],
  [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
                   //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
  [llvm_v4i32_ty,   // rsrc(SGPR)
   llvm_i32_ty,     // vindex(VGPR)
   llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                    //                                   bit 1 = slc,
                    //                                   bit 2 = dlc on gfx10+),
                    //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
  [],
  [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // vindex(VGPR)
   llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
                   //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,    // src(VGPR)
   llvm_i32_ty,    // cmp(VGPR)
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // vindex(VGPR)
   llvm_i32_ty,    // offset(SGPR/VGPR/imm)
   llvm_i1_ty],    // slc(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;

class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic. The raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;

class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                       // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>,  // LDS base offset
   llvm_i32_ty,                         // Data byte size: 1/2/4
   llvm_i32_ty,                         // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                         // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                         // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                        // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                        //                                   bit 1 = slc,
                                        //                                   bit 2 = dlc on gfx10+),
                                        //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
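// For illustration only: a hypothetical IR use copying 4 bytes per lane
// straight into LDS (opaque-pointer syntax assumed; the final two i32 0
// operands are the imm offset and the auxiliary data):
//   call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc,
//       ptr addrspace(3) %lds, i32 4, i32 %voffset, i32 %soffset, i32 0, i32 0)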

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                       // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>,  // LDS base offset
   llvm_i32_ty,                         // Data byte size: 1/2/4
   llvm_i32_ty,                         // vindex(VGPR)
   llvm_i32_ty,                         // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                         // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                         // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                        // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                        //                                   bit 1 = slc,
                                        //                                   bit 2 = dlc on gfx10+),
                                        //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm (ignored on GFX11+)
  ],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly]
>;
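// For illustration only: a hypothetical pixel-shader color export to MRT0
// (tgt 0) with all four channels enabled (en 0xf) and done/vm set:
//   call void @llvm.amdgcn.exp.f32(i32 0, i32 15,
//       float %r, float %g, float %b, float %a, i1 true, i1 true)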

// exp with row_en bit set. Only supported on GFX11+.
def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i32_ty],      // row number
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   IntrWriteMem, IntrInaccessibleMemOnly]
>;

// exp with compr bit set. Not supported on GFX11+.
def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,        // tgt
  llvm_i32_ty,        // en
  llvm_anyvector_ty,  // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,   // src1
  llvm_i1_ty,         // done
  llvm_i1_ty],        // vm
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly]
>;

def int_amdgcn_buffer_wbinvl1_sc :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_buffer_wbinvl1 :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_dcache_inv :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_memtime :
  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_sleep :
  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]> {
}

def int_amdgcn_s_incperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]> {
}

def int_amdgcn_s_decperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]> {
}

def int_amdgcn_s_sethalt :
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]>;

def int_amdgcn_s_setprio :
  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]>;

// This is IntrHasSideEffects so it can be used to read cycle counters.
def int_amdgcn_s_getreg :
  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty],
    [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
>;

// Note this can be used to set FP environment properties that are
// unsafe to change in non-strictfp functions. The register properties
// available (and value required to access them) may differ per
// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
def int_amdgcn_s_setreg :
  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
>;

// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest does
// not cross a 4Gb address boundary. Use for any other purpose may not
// produce the desired results as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
def int_amdgcn_s_getpc :
  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
                                            IntrWillReturn]>;
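// For illustration only, a hypothetical sketch of that convention: keep the
// high half of the PC and splice in a relocated low 32 bits:
//   %pc   = call i64 @llvm.amdgcn.s.getpc()
//   %hi   = and i64 %pc, -4294967296     ; clear the low 32 bits
//   %lo   = zext i32 %target_lo to i64   ; low half from a 32-bit reloc
//   %addr = or i64 %hi, %lo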

// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// This intrinsic reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
// See int_amdgcn_interp_p1 for why this is IntrNoMem.
def int_amdgcn_interp_p2 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
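// For illustration only: a hypothetical two-stage interpolation of
// attribute 0, channel 0, where %i/%j are the barycentric coordinates:
//   %p1 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %m0)
//   %v  = call float @llvm.amdgcn.interp.p2(float %p1, float %j, i32 0, i32 0, i32 %m0)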

// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p1_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p2_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  DefaultAttrsIntrinsic<[llvm_half_ty],
    [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.lds.direct.load <m0>
// The input argument is m0, which contains a packed combination of address
// offset and flags describing the data type.
def int_amdgcn_lds_direct_load :
  DefaultAttrsIntrinsic<[llvm_any_ty],  // overloaded for types u8, u16, i32/f32, i8, i16
    [llvm_i32_ty],
    [IntrReadMem, IntrSpeculatable]>;

// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
// Like the interp intrinsics, this reads from lds, but the memory values are
// constant, so it behaves like IntrNoMem.
def int_amdgcn_lds_param_load :
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
def int_amdgcn_interp_inreg_p10 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
def int_amdgcn_interp_inreg_p2 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
// high selects whether high or low 16-bits are used for p and p0 operands
def int_amdgcn_interp_inreg_p10_f16 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
// high selects whether high or low 16-bits are used for p operand
def int_amdgcn_interp_inreg_p2_f16 :
  DefaultAttrsIntrinsic<[llvm_half_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<3>>]>;

// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : DefaultAttrsIntrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem]>;

// Query currently live lanes.
// Returns true if the lane is live (and not a helper lane).
def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty],
  [], [IntrReadMem, IntrInaccessibleMemOnly]
>;

def int_amdgcn_mbcnt_lo :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem]>;

def int_amdgcn_mbcnt_hi :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem]>;

// llvm.amdgcn.ds.swizzle <src> <offset>
def int_amdgcn_ds_swizzle :
  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
     ImmArg<ArgIndex<1>>]>;
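// For illustration only: a hypothetical swap of adjacent lanes using the
// bit-mode offset encoding new_lane = ((lane & and_mask) | or_mask) ^ xor_mask,
// with and_mask = 0x1f, or_mask = 0, xor_mask = 1. The exact field layout is
// an assumption here; consult the ISA guide before relying on it:
//   %y = call i32 @llvm.amdgcn.ds.swizzle(i32 %x, i32 1055)  ; 0x041f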

def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
  [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
  [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_lerp :
  ClangBuiltin<"__builtin_amdgcn_lerp">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_msad_u8 :
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_hi_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_u16 :
  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_qsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mqsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mqsad_u32_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_u8_f32 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
    [IntrNoMem, IntrConvergent,
     ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
    [IntrNoMem, IntrConvergent,
     ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ballot :
  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_readfirstlane :
  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  ClangBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
  ClangBuiltin<"__builtin_amdgcn_writelane">,
  Intrinsic<[llvm_i32_ty], [
    llvm_i32_ty,  // uniform value to write: returned by the selected lane
    llvm_i32_ty,  // uniform lane select
    llvm_i32_ty   // returned by all lanes other than the selected one
  ],
  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
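// For illustration only: hypothetical uses broadcasting lane 0's value and
// then patching lane 5 (lane index and written value must be uniform):
//   %v0 = call i32 @llvm.amdgcn.readlane(i32 %x, i32 0)
//   %w  = call i32 @llvm.amdgcn.writelane(i32 %v0, i32 5, i32 %x)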

def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_init :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_init">,
  Intrinsic<[],
    [llvm_i32_ty, llvm_i32_ty],
    [IntrConvergent, IntrWriteMem,
     IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
    [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
// vsrc0 is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_barrier :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
  Intrinsic<[],
    [llvm_i32_ty, llvm_i32_ty],
    [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
    [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.v(i32 resource_id)
def int_amdgcn_ds_gws_sema_v :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">,
  Intrinsic<[],
    [llvm_i32_ty],
    [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
    [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id)
def int_amdgcn_ds_gws_sema_br :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">,
  Intrinsic<[],
    [llvm_i32_ty, llvm_i32_ty],
    [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
    [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.p(i32 resource_id)
def int_amdgcn_ds_gws_sema_p :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">,
  Intrinsic<[],
    [llvm_i32_ty],
    [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
    [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id)
def int_amdgcn_ds_gws_sema_release_all :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">,
  Intrinsic<[],
    [llvm_i32_ty],
    [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
    [SDNPMemOperand]
>;


// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.
def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Copies the source value to the destination value, such that the source
// is computed as if the entire program were executed in WQM if any other
// program code executes in WQM.
def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Return true if at least one thread within the pixel quad passes true into
// the function.
def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
  [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// If false, set EXEC=0 for the current thread until the end of program.
// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn?
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>;

def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">,
  Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrNoCallback, IntrNoFree]
>;

// If false, mark all active lanes as helper lanes until the end of program.
def int_amdgcn_wqm_demote : Intrinsic<[],
  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree]
>;

// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels
// enabled, with a few exceptions:
// - Phi nodes which require WWM return an undefined value.
def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
// Deprecated. Use int_amdgcn_strict_wwm instead.
def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Given a value, copies it while setting all the inactive lanes to a given
// value. Note that OpenGL helper lanes are considered active, so if the
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
  Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>,   // value to be copied
     LLVMMatchType<0>],  // value for the inactive lanes to take
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
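// For illustration only: a hypothetical whole-wave sum where inactive lanes
// must contribute the identity element (0 for addition) before a reduction
// is read back under strict WWM:
//   %in  = call i32 @llvm.amdgcn.set.inactive.i32(i32 %v, i32 0)
//   ; ... reduce %in across all lanes into %sum, then:
//   %out = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %sum)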

// Return true if the given flat pointer points to a local memory address.
def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">,
  DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],
    [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]
>;

// Return true if the given flat pointer points to a private memory address.
def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
  DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],
    [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]
>;

//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_s_dcache_inv_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_buffer_wbinvl1_vol :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

//===----------------------------------------------------------------------===//
// VI Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_mov_dpp :
  Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
     llvm_i1_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn,
     ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// Should be equivalent to:
//   v_mov_b32 <dest> <old>
//   v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_update_dpp :
  Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
     llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
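// For illustration only: a hypothetical row shift down by one lane, filling
// shifted-out positions from %old. The dpp_ctrl value 273 (0x111) is assumed
// to encode row_shr:1; row_mask/bank_mask 0xf enable all rows and banks:
//   %r = call i32 @llvm.amdgcn.update.dpp.i32(i32 %old, i32 %src,
//            i32 273, i32 15, i32 15, i1 true)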

def int_amdgcn_s_dcache_wb :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_dcache_wb_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_memrealtime :
  ClangBuiltin<"__builtin_amdgcn_s_memrealtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.ds.permute <index> <src>
def int_amdgcn_ds_permute :
  ClangBuiltin<"__builtin_amdgcn_ds_permute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.ds.bpermute <index> <src>
def int_amdgcn_ds_bpermute :
  ClangBuiltin<"__builtin_amdgcn_ds_bpermute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.perm <src0> <src1> <selector>
def int_amdgcn_perm :
  ClangBuiltin<"__builtin_amdgcn_perm">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

//===----------------------------------------------------------------------===//
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//

class AMDGPUGlobalLoadLDS : Intrinsic <
  [],
  [LLVMQualPointerType<llvm_i8_ty, 1>,  // Base global pointer to load from
   LLVMQualPointerType<llvm_i8_ty, 3>,  // LDS base pointer to store to
   llvm_i32_ty,                         // Data byte size: 1/2/4
   llvm_i32_ty,                         // imm offset (applied to both global and LDS address)
   llvm_i32_ty],                        // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
                                        //                                   bit 1 = slc/sc1,
                                        //                                   bit 2 = dlc on gfx10+,
                                        //                                   bit 4 = scc/nt on gfx90a+))
  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
  "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;

//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
  Intrinsic<[llvm_i32_ty],
    [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
  Intrinsic<[llvm_i32_ty],
    [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
// the lanes to read from.
def int_amdgcn_mov_dpp8 :
  Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn,
     ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>;
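// For illustration only: the low 24 bits of <sel> are assumed to hold eight
// 3-bit lane selects, lane i reading from lane (sel >> (3 * i)) & 7 within
// its group of eight. Reversing each group would then use:
//   sel = 7 | 6<<3 | 5<<6 | 4<<9 | 3<<12 | 2<<15 | 1<<18 | 0<<21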

def int_amdgcn_s_get_waveid_in_workgroup :
  ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
  Intrinsic<[llvm_i32_ty], [],
    [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
  [vt],
  [llvm_anyptr_ty,  // vaddr
   vt],             // vdata(VGPR)
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]>;

def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;

// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
//                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
def int_amdgcn_image_bvh_intersect_ray :
  DefaultAttrsIntrinsic<[llvm_v4i32_ty],
    [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
     LLVMMatchType<1>, llvm_v4i32_ty],
    [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

//===----------------------------------------------------------------------===//
// GFX11 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
  ClangBuiltin<"__builtin_amdgcn_permlane64">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ds_add_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
    [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ds_sub_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
    [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ds_bvh_stack_rtn :
  Intrinsic<
    [llvm_i32_ty, llvm_i32_ty],  // %vdst, %addr
    [
      llvm_i32_ty,    // %addr
      llvm_i32_ty,    // %data0
      llvm_v4i32_ty,  // %data1
      llvm_i32_ty,    // %offset
    ],
    [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C.

class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],  // %D
    [
      AB,                // %A
      AB,                // %B
      LLVMMatchType<0>,  // %C
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],  // %D
    [
      AB,                // %A
      AB,                // %B
      LLVMMatchType<0>,  // %C
      llvm_i1_ty,        // %high
    ],
    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],  // %D
    [
      llvm_i1_ty,        // %A_sign
      AB,                // %A
      llvm_i1_ty,        // %B_sign
      AB,                // %B
      LLVMMatchType<0>,  // %C
      llvm_i1_ty,        // %clamp
    ],
    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
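// For illustration only: a hypothetical wave32 use of the f16 variant, where
// the accumulator overload is assumed to be <8 x float> (wave64 would use
// <4 x float>) and the mangling suffix follows the overloaded C/D type:
//   %d = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(
//            <16 x half> %a, <16 x half> %b, <8 x float> %c)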

def int_amdgcn_s_wait_event_export_ready :
  ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
>;

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//

// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2 :
  ClangBuiltin<"__builtin_amdgcn_fdot2">,
  DefaultAttrsIntrinsic<
    [llvm_float_ty],  // %r
    [
      llvm_v2f16_ty,  // %a
      llvm_v2f16_ty,  // %b
      llvm_float_ty,  // %c
      llvm_i1_ty      // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
>;
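// For illustration only, a worked instance of the formula above:
//   a = (1.0, 2.0), b = (3.0, 4.0), c = 0.5
//   r = 1.0*3.0 + 2.0*4.0 + 0.5 = 11.5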

// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
  DefaultAttrsIntrinsic<
    [llvm_half_ty],   // %r
    [
      llvm_v2f16_ty,  // %a
      llvm_v2f16_ty,  // %b
      llvm_half_ty    // %c
    ],
    [IntrNoMem, IntrSpeculatable]
>;

// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_bf16_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
  DefaultAttrsIntrinsic<
    [llvm_i16_ty],    // %r
    [
      llvm_v2i16_ty,  // %a
      llvm_v2i16_ty,  // %b
      llvm_i16_ty     // %c
    ],
    [IntrNoMem, IntrSpeculatable]
>;

// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
  DefaultAttrsIntrinsic<
    [llvm_float_ty],  // %r
    [
      llvm_v2i16_ty,  // %a
      llvm_v2i16_ty,  // %b
      llvm_float_ty,  // %c
      llvm_i1_ty      // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
>;

// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
  ClangBuiltin<"__builtin_amdgcn_sdot2">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty],    // %r
    [
      llvm_v2i16_ty,  // %a
      llvm_v2i16_ty,  // %b
      llvm_i32_ty,    // %c
      llvm_i1_ty      // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
>;

// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_udot2 :
  ClangBuiltin<"__builtin_amdgcn_udot2">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty],    // %r
    [
      llvm_v2i16_ty,  // %a
      llvm_v2i16_ty,  // %b
      llvm_i32_ty,    // %c
      llvm_i1_ty      // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
>;

// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sdot4 :
  ClangBuiltin<"__builtin_amdgcn_sdot4">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty],  // %r
    [
      llvm_i32_ty,  // %a
      llvm_i32_ty,  // %b
      llvm_i32_ty,  // %c
      llvm_i1_ty    // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
>;
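// For illustration only, a worked instance with all packed bytes equal
// (byte order then does not matter):
//   %a = 0x01010101 (four signed bytes of 1), %b = 0x02020202, %c = 0
//   %r = 4 * (1 * 2) + 0 = 8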
||
2178 | |||
2179 | // u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) |
||
2180 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c |
||
2181 | def int_amdgcn_udot4 : |
||
2182 | ClangBuiltin<"__builtin_amdgcn_udot4">, |
||
2183 | DefaultAttrsIntrinsic< |
||
2184 | [llvm_i32_ty], // %r |
||
2185 | [ |
||
2186 | llvm_i32_ty, // %a |
||
2187 | llvm_i32_ty, // %b |
||
2188 | llvm_i32_ty, // %c |
||
2189 | llvm_i1_ty // %clamp |
||
2190 | ], |
||
2191 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
2192 | >; |
||
2193 | |||
2194 | // i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) |
||
2195 | // Treat input as signed (_sign = 1) or unsigned (_sign = 0). |
||
2196 | // a[i in 0. . . 3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i])); |
||
2197 | // b[i in 0. . . 3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); |
||
2198 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c |
||
2199 | def int_amdgcn_sudot4 : |
||
2200 | ClangBuiltin<"__builtin_amdgcn_sudot4">, |
||
2201 | DefaultAttrsIntrinsic< |
||
2202 | [llvm_i32_ty], // %r |
||
2203 | [ |
||
2204 | llvm_i1_ty, // %a_sign |
||
2205 | llvm_i32_ty, // %a |
||
2206 | llvm_i1_ty, // %b_sign |
||
2207 | llvm_i32_ty, // %b |
||
2208 | llvm_i32_ty, // %c |
||
2209 | llvm_i1_ty // %clamp |
||
2210 | ], |
||
2211 | [IntrNoMem, IntrSpeculatable, |
||
2212 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] |
||
2213 | >; |
||
2214 | |||
2215 | // i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) |
||
2216 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
2217 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
2218 | def int_amdgcn_sdot8 : |
||
2219 | ClangBuiltin<"__builtin_amdgcn_sdot8">, |
||
2220 | DefaultAttrsIntrinsic< |
||
2221 | [llvm_i32_ty], // %r |
||
2222 | [ |
||
2223 | llvm_i32_ty, // %a |
||
2224 | llvm_i32_ty, // %b |
||
2225 | llvm_i32_ty, // %c |
||
2226 | llvm_i1_ty // %clamp |
||
2227 | ], |
||
2228 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
2229 | >; |
||
2230 | |||
2231 | // u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp) |
||
2232 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
2233 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
2234 | def int_amdgcn_udot8 : |
||
2235 | ClangBuiltin<"__builtin_amdgcn_udot8">, |
||
2236 | DefaultAttrsIntrinsic< |
||
2237 | [llvm_i32_ty], // %r |
||
2238 | [ |
||
2239 | llvm_i32_ty, // %a |
||
2240 | llvm_i32_ty, // %b |
||
2241 | llvm_i32_ty, // %c |
||
2242 | llvm_i1_ty // %clamp |
||
2243 | ], |
||
2244 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
2245 | >; |
||
2246 | |||
2247 | // i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) |
||
2248 | // Treat input as signed (_sign = 1) or unsigned (_sign = 0). |
||
2249 | // a[i in 0. . . 7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); |
||
2250 | // b[i in 0. . . 7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i])); |
||
2251 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
2252 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
2253 | def int_amdgcn_sudot8 : |
||
2254 | ClangBuiltin<"__builtin_amdgcn_sudot8">, |
||
2255 | DefaultAttrsIntrinsic< |
||
2256 | [llvm_i32_ty], // %r |
||
2257 | [ |
||
2258 | llvm_i1_ty, // %a_sign |
||
2259 | llvm_i32_ty, // %a |
||
2260 | llvm_i1_ty, // %b_sign |
||
2261 | llvm_i32_ty, // %b |
||
2262 | llvm_i32_ty, // %c |
||
2263 | llvm_i1_ty // %clamp |
||
2264 | ], |
||
2265 | [IntrNoMem, IntrSpeculatable, |
||
2266 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] |
||
2267 | >; |

//===----------------------------------------------------------------------===//
// gfx908 intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
    [SrcABTy, SrcABTy, DestTy,
     llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrConvergent, IntrNoMem,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
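
// Illustrative IR use of one of the defs above, with cbsz, abid and blgp
// all zero (value names hypothetical):
//   %d = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(
//            float %a, float %b, <4 x float> %c, i32 0, i32 0, i32 0)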

//===----------------------------------------------------------------------===//
// gfx90a intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;

// Note: on gfx940 the BLGP argument is replaced by a NEG bitfield in the
// DGEMM MFMA: three bits that apply the neg modifier to the respective
// source operands.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
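
// Illustrative IR use (value names hypothetical): on gfx940 the final
// operand holds the NEG bits described above; this example assumes the
// low bit negates srcA.
//   %d = call double @llvm.amdgcn.mfma.f64.4x4x4f64(
//            double %a, double %b, double %c, i32 0, i32 0, i32 1)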

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use a v2i16 argument since there is no bf16 data type in LLVM.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty],
    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
  ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
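
// Illustrative IR use (value names hypothetical): the operand and result
// carry two bf16 values in <2 x i16> bit form, per the note above.
//   %old = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(
//              ptr addrspace(3) %p, <2 x i16> %v)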

def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
  AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
    [SrcA, SrcB, DestTy, llvm_i32_ty,
     llvm_i32_ty, llvm_i32_ty],
    [IntrConvergent, IntrNoMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
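
// Illustrative IR use of one of the defs above (value names hypothetical):
// srcA is the compressed sparse operand, %idx the sparsity index register,
// and cbsz/abid are zero.
//   %d = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(
//            <4 x half> %a, <8 x half> %b, <4 x float> %c,
//            i32 %idx, i32 0, i32 0)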

class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
  AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects the byte from srcA.
def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, ImmArg<ArgIndex<1>>]>;
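
// Illustrative IR use (value names hypothetical): decode byte 1 of a
// packed register.
//   %f = call float @llvm.amdgcn.cvt.f32.bf8(i32 %packed, i32 1)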

// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
    [llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, ImmArg<ArgIndex<1>>]>;
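
// Illustrative IR use (value names hypothetical): word_sel = 1 decodes the
// two high bytes.
//   %pair = call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %packed, i1 true)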

// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel
def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
    [llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects the 2 high bytes of vdst, 0 selects the 2 low bytes.
def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
    [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
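
// Illustrative IR use (value names hypothetical): pack two floats into the
// two low bytes of %old.
//   %packed = call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b,
//                                                  i32 %old, i1 false)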

// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
    [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects the byte to write into vdst.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
    [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
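
// Illustrative IR use (value names hypothetical): convert %a, with %rnd
// assumed to seed the stochastic rounding, writing byte 2 of %old.
//   %packed = call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %rnd,
//                                                  i32 %old, i32 2)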

// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
    [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// Represent a relocation constant.
def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic<
  [llvm_i32_ty], [llvm_metadata_ty],
  [IntrNoMem, IntrSpeculatable]
>;
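
// Illustrative IR use, assuming the metadata operand is a string node
// naming the relocation symbol (the symbol name here is hypothetical):
//   %val = call i32 @llvm.amdgcn.reloc.constant(metadata !0)
//   ; with e.g. !0 = !{!"my.reloc.symbol"}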

//===----------------------------------------------------------------------===//
// Special intrinsics for backend internal use only. No frontend
// should emit calls to these.
//===----------------------------------------------------------------------===//
def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;

// Emit a 2.5 ulp, no-denormal division. Should only be inserted by a pass
// based on !fpmath metadata.
def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;
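
// Illustrative IR use (value names hypothetical); per the note above, this
// call would normally be created by a pass from an fdiv carrying !fpmath
// metadata rather than written by hand.
//   %q = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)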
}