Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 14 | pmbaty | 1 | //===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===// |
| 2 | // |
||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
| 6 | // |
||
| 7 | //===----------------------------------------------------------------------===// |
||
| 8 | // |
||
| 9 | // This file defines all of the R600-specific intrinsics. |
||
| 10 | // |
||
| 11 | //===----------------------------------------------------------------------===// |
||
| 12 | |||
| 13 | class AMDGPUReadPreloadRegisterIntrinsic |
||
| 14 | : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 15 | |||
| 16 | class AMDGPUReadPreloadRegisterIntrinsicNamed<string name> |
||
| 17 | : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>; |
||
| 18 | |||
| 19 | // Used to tag image and resource intrinsics with information used to generate |
||
| 20 | // mem operands. |
||
| 21 | class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> { |
||
| 22 | int RsrcArg = rsrcarg; |
||
| 23 | bit IsImage = isimage; |
||
| 24 | } |
||
| 25 | |||
| 26 | let TargetPrefix = "r600" in { |
||
| 27 | |||
| 28 | multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz { |
||
| 29 | def _x : AMDGPUReadPreloadRegisterIntrinsic; |
||
| 30 | def _y : AMDGPUReadPreloadRegisterIntrinsic; |
||
| 31 | def _z : AMDGPUReadPreloadRegisterIntrinsic; |
||
| 32 | } |
||
| 33 | |||
| 34 | multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> { |
||
| 35 | def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>; |
||
| 36 | def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>; |
||
| 37 | def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>; |
||
| 38 | } |
||
| 39 | |||
| 40 | defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named |
||
| 41 | <"__builtin_r600_read_global_size">; |
||
| 42 | defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named |
||
| 43 | <"__builtin_r600_read_ngroups">; |
||
| 44 | defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named |
||
| 45 | <"__builtin_r600_read_tgid">; |
||
| 46 | |||
| 47 | defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz; |
||
| 48 | defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz; |
||
| 49 | |||
| 50 | def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">, |
||
| 51 | Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>; |
||
| 52 | |||
| 53 | // AS 7 is PARAM_I_ADDRESS, used for kernel arguments |
||
| 54 | def int_r600_implicitarg_ptr : |
||
| 55 | ClangBuiltin<"__builtin_r600_implicitarg_ptr">, |
||
| 56 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [], |
||
| 57 | [IntrNoMem, IntrSpeculatable]>; |
||
| 58 | |||
| 59 | def int_r600_rat_store_typed : |
||
| 60 | // 1st parameter: Data |
||
| 61 | // 2nd parameter: Index |
||
| 62 | // 3rd parameter: Constant RAT ID |
||
| 63 | DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>, |
||
| 64 | ClangBuiltin<"__builtin_r600_rat_store_typed">; |
||
| 65 | |||
| 66 | def int_r600_recipsqrt_ieee : DefaultAttrsIntrinsic< |
||
| 67 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 68 | >; |
||
| 69 | |||
| 70 | def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic< |
||
| 71 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 72 | >; |
||
| 73 | |||
| 74 | def int_r600_cube : DefaultAttrsIntrinsic< |
||
| 75 | [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] |
||
| 76 | >; |
||
| 77 | |||
| 78 | def int_r600_store_stream_output : DefaultAttrsIntrinsic< |
||
| 79 | [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] |
||
| 80 | >; |
||
| 81 | |||
| 82 | class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [ |
||
| 83 | llvm_v4f32_ty, // Coord |
||
| 84 | llvm_i32_ty, // offset_x |
||
| 85 | llvm_i32_ty, // offset_y, |
||
| 86 | llvm_i32_ty, // offset_z, |
||
| 87 | llvm_i32_ty, // resource_id |
||
| 88 | llvm_i32_ty, // samplerid |
||
| 89 | llvm_i32_ty, // coord_type_x |
||
| 90 | llvm_i32_ty, // coord_type_y |
||
| 91 | llvm_i32_ty, // coord_type_z |
||
| 92 | llvm_i32_ty], // coord_type_w |
||
| 93 | [IntrNoMem] |
||
| 94 | >; |
||
| 95 | |||
| 96 | class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [ |
||
| 97 | llvm_v4i32_ty, // Coord |
||
| 98 | llvm_i32_ty, // offset_x |
||
| 99 | llvm_i32_ty, // offset_y, |
||
| 100 | llvm_i32_ty, // offset_z, |
||
| 101 | llvm_i32_ty, // resource_id |
||
| 102 | llvm_i32_ty, // samplerid |
||
| 103 | llvm_i32_ty, // coord_type_x |
||
| 104 | llvm_i32_ty, // coord_type_y |
||
| 105 | llvm_i32_ty, // coord_type_z |
||
| 106 | llvm_i32_ty], // coord_type_w |
||
| 107 | [IntrNoMem] |
||
| 108 | >; |
||
| 109 | |||
| 110 | def int_r600_store_swizzle : |
||
| 111 | Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 112 | >; |
||
| 113 | |||
| 114 | def int_r600_tex : TextureIntrinsicFloatInput; |
||
| 115 | def int_r600_texc : TextureIntrinsicFloatInput; |
||
| 116 | def int_r600_txl : TextureIntrinsicFloatInput; |
||
| 117 | def int_r600_txlc : TextureIntrinsicFloatInput; |
||
| 118 | def int_r600_txb : TextureIntrinsicFloatInput; |
||
| 119 | def int_r600_txbc : TextureIntrinsicFloatInput; |
||
| 120 | def int_r600_txf : TextureIntrinsicInt32Input; |
||
| 121 | def int_r600_txq : TextureIntrinsicInt32Input; |
||
| 122 | def int_r600_ddx : TextureIntrinsicFloatInput; |
||
| 123 | def int_r600_ddy : TextureIntrinsicFloatInput; |
||
| 124 | |||
| 125 | def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 126 | [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] |
||
| 127 | >; |
||
| 128 | |||
| 129 | def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>; |
||
| 130 | |||
| 131 | } // End TargetPrefix = "r600" |
||
| 132 | |||
| 133 | let TargetPrefix = "amdgcn" in { |
||
| 134 | |||
| 135 | //===----------------------------------------------------------------------===// |
||
| 136 | // ABI Special Intrinsics |
||
| 137 | //===----------------------------------------------------------------------===// |
||
| 138 | |||
| 139 | defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz; |
||
| 140 | defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named |
||
| 141 | <"__builtin_amdgcn_workgroup_id">; |
||
| 142 | |||
| 143 | def int_amdgcn_dispatch_ptr : |
||
| 144 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 145 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 146 | |||
| 147 | def int_amdgcn_queue_ptr : |
||
| 148 | ClangBuiltin<"__builtin_amdgcn_queue_ptr">, |
||
| 149 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 150 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 151 | |||
| 152 | def int_amdgcn_kernarg_segment_ptr : |
||
| 153 | ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, |
||
| 154 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 155 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 156 | |||
| 157 | def int_amdgcn_implicitarg_ptr : |
||
| 158 | ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">, |
||
| 159 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 160 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 161 | |||
| 162 | def int_amdgcn_groupstaticsize : |
||
| 163 | ClangBuiltin<"__builtin_amdgcn_groupstaticsize">, |
||
| 164 | DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 165 | |||
| 166 | def int_amdgcn_dispatch_id : |
||
| 167 | ClangBuiltin<"__builtin_amdgcn_dispatch_id">, |
||
| 168 | DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 169 | |||
| 170 | // For internal use. Coordinates LDS lowering between IR transform and backend. |
||
| 171 | def int_amdgcn_lds_kernel_id : |
||
| 172 | DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 173 | |||
| 174 | def int_amdgcn_implicit_buffer_ptr : |
||
| 175 | ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, |
||
| 176 | DefaultAttrsIntrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], |
||
| 177 | [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>; |
||
| 178 | |||
| 179 | // Set EXEC to the 64-bit value given. |
||
| 180 | // This is always moved to the beginning of the basic block. |
||
| 181 | // FIXME: Should be mangled for wave size. |
||
| 182 | def int_amdgcn_init_exec : Intrinsic<[], |
||
| 183 | [llvm_i64_ty], // 64-bit literal constant |
||
| 184 | [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback, |
||
| 185 | IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>; |
||
| 186 | |||
| 187 | // Set EXEC according to a thread count packed in an SGPR input: |
||
| 188 | // thread_count = (input >> bitoffset) & 0x7f; |
||
| 189 | // This is always moved to the beginning of the basic block. |
||
| 190 | // Note: only inreg arguments to the parent function are valid as |
||
| 191 | // inputs to this intrinsic, computed values cannot be used. |
||
| 192 | def int_amdgcn_init_exec_from_input : Intrinsic<[], |
||
| 193 | [llvm_i32_ty, // 32-bit SGPR input |
||
| 194 | llvm_i32_ty], // bit offset of the thread count |
||
| 195 | [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback, |
||
| 196 | IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>; |
||
| 197 | |||
| 198 | def int_amdgcn_wavefrontsize : |
||
| 199 | ClangBuiltin<"__builtin_amdgcn_wavefrontsize">, |
||
| 200 | DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; |
||
| 201 | |||
| 202 | |||
| 203 | //===----------------------------------------------------------------------===// |
||
| 204 | // Instruction Intrinsics |
||
| 205 | //===----------------------------------------------------------------------===// |
||
| 206 | |||
| 207 | // The first parameter is s_sendmsg immediate (i16), |
||
| 208 | // the second one is copied to m0 |
||
| 209 | def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">, |
||
| 210 | Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], |
||
| 211 | [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; |
||
| 212 | def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">, |
||
| 213 | Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], |
||
| 214 | [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; |
||
| 215 | |||
| 216 | |||
| 217 | // gfx11 intrinsic |
||
| 218 | // The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64. |
||
| 219 | def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], |
||
| 220 | [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>; |
||
| 221 | |||
| 222 | def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">, |
||
| 223 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 224 | |||
| 225 | def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">, |
||
| 226 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 227 | |||
| 228 | // The 1st parameter is a mask for the types of instructions that may be allowed |
||
| 229 | // to cross the SCHED_BARRIER during scheduling. |
||
| 230 | // MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER. |
||
| 231 | // MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be |
||
| 232 | // scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass. |
||
| 233 | // MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER. |
||
| 234 | // MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER. |
||
| 235 | // MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER. |
||
| 236 | // MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER. |
||
| 237 | // MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER. |
||
| 238 | // MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER. |
||
| 239 | // MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER. |
||
| 240 | // MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER. |
||
| 241 | // MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER. |
||
| 242 | def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">, |
||
| 243 | Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, |
||
| 244 | IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 245 | |||
| 246 | // The first parameter is a mask that determines the types of instructions that |
||
| 247 | // you would like to synchronize around and add to a scheduling group. The |
||
| 248 | // values of the mask are defined above for sched_barrier. These instructions |
||
| 249 | // will be selected from the bottom up starting from the sched_group_barrier's |
||
| 250 | // location during instruction scheduling. The second parameter is the number of |
||
| 251 | // matching instructions that will be associated with this sched_group_barrier. |
||
| 252 | // The third parameter is an identifier which is used to describe what other |
||
| 253 | // sched_group_barriers should be synchronized with. |
||
| 254 | def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">, |
||
| 255 | Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 256 | [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects, |
||
| 257 | IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 258 | |||
| 259 | // Scheduler optimization hint. |
||
| 260 | // MASK = 0: Small gemm opt |
||
| 261 | def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, |
||
| 262 | Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, |
||
| 263 | IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 264 | |||
| 265 | def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">, |
||
| 266 | Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 267 | |||
| 268 | def int_amdgcn_div_scale : DefaultAttrsIntrinsic< |
||
| 269 | // 1st parameter: Numerator |
||
| 270 | // 2nd parameter: Denominator |
||
| 271 | // 3rd parameter: Select quotient. Must equal Numerator or Denominator. |
||
| 272 | // (0 = Denominator, 1 = Numerator). |
||
| 273 | [llvm_anyfloat_ty, llvm_i1_ty], |
||
| 274 | [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], |
||
| 275 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>] |
||
| 276 | >; |
||
| 277 | |||
| 278 | def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], |
||
| 279 | [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], |
||
| 280 | [IntrNoMem, IntrSpeculatable] |
||
| 281 | >; |
||
| 282 | |||
| 283 | def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], |
||
| 284 | [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], |
||
| 285 | [IntrNoMem, IntrSpeculatable] |
||
| 286 | >; |
||
| 287 | |||
| 288 | // Look Up 2.0 / pi src0 with segment select src1[4:0] |
||
| 289 | def int_amdgcn_trig_preop : DefaultAttrsIntrinsic< |
||
| 290 | [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], |
||
| 291 | [IntrNoMem, IntrSpeculatable] |
||
| 292 | >; |
||
| 293 | |||
| 294 | def int_amdgcn_sin : DefaultAttrsIntrinsic< |
||
| 295 | [llvm_anyfloat_ty], [LLVMMatchType<0>], |
||
| 296 | [IntrNoMem, IntrSpeculatable] |
||
| 297 | >; |
||
| 298 | |||
| 299 | def int_amdgcn_cos : DefaultAttrsIntrinsic< |
||
| 300 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 301 | >; |
||
| 302 | |||
| 303 | def int_amdgcn_log_clamp : DefaultAttrsIntrinsic< |
||
| 304 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 305 | >; |
||
| 306 | |||
| 307 | def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">, |
||
| 308 | DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], |
||
| 309 | [IntrNoMem, IntrSpeculatable, Commutative] |
||
| 310 | >; |
||
| 311 | |||
| 312 | // Fused single-precision multiply-add with legacy behaviour for the multiply, |
||
| 313 | // which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is |
||
| 314 | // intended for use on subtargets that have the v_fma_legacy_f32 and/or |
||
| 315 | // v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and |
||
| 316 | // has a completely different kind of legacy behaviour.) |
||
| 317 | def int_amdgcn_fma_legacy : |
||
| 318 | DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 319 | [IntrNoMem, IntrSpeculatable, Commutative] |
||
| 320 | >; |
||
| 321 | |||
| 322 | def int_amdgcn_rcp : DefaultAttrsIntrinsic< |
||
| 323 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 324 | >; |
||
| 325 | |||
| 326 | def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">, |
||
| 327 | DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], |
||
| 328 | [IntrNoMem, IntrSpeculatable] |
||
| 329 | >; |
||
| 330 | |||
| 331 | def int_amdgcn_sqrt : DefaultAttrsIntrinsic< |
||
| 332 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 333 | >; |
||
| 334 | |||
| 335 | def int_amdgcn_rsq : DefaultAttrsIntrinsic< |
||
| 336 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 337 | >; |
||
| 338 | |||
| 339 | def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">, |
||
| 340 | DefaultAttrsIntrinsic< |
||
| 341 | [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable] |
||
| 342 | >; |
||
| 343 | |||
| 344 | // out = 1.0 / sqrt(a) result clamped to +/- max_float. |
||
| 345 | def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic< |
||
| 346 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; |
||
| 347 | |||
| 348 | // For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will be used. |
||
| 349 | def int_amdgcn_ldexp : DefaultAttrsIntrinsic< |
||
| 350 | [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], |
||
| 351 | [IntrNoMem, IntrSpeculatable] |
||
| 352 | >; |
||
| 353 | |||
| 354 | def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic< |
||
| 355 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 356 | >; |
||
| 357 | |||
| 358 | def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic< |
||
| 359 | [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable] |
||
| 360 | >; |
||
| 361 | |||
| 362 | // v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0 |
||
| 363 | // and always uses rtz, so is not suitable for implementing the OpenCL |
||
| 364 | // fract function. It should be ok on VI. |
||
| 365 | def int_amdgcn_fract : DefaultAttrsIntrinsic< |
||
| 366 | [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] |
||
| 367 | >; |
||
| 368 | |||
| 369 | def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">, |
||
| 370 | DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], |
||
| 371 | [IntrNoMem, IntrSpeculatable] |
||
| 372 | >; |
||
| 373 | |||
| 374 | def int_amdgcn_cvt_pknorm_i16 : |
||
| 375 | ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">, |
||
| 376 | DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], |
||
| 377 | [IntrNoMem, IntrSpeculatable] |
||
| 378 | >; |
||
| 379 | |||
| 380 | def int_amdgcn_cvt_pknorm_u16 : |
||
| 381 | ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">, |
||
| 382 | DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty], |
||
| 383 | [IntrNoMem, IntrSpeculatable] |
||
| 384 | >; |
||
| 385 | |||
| 386 | def int_amdgcn_cvt_pk_i16 : |
||
| 387 | ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">, |
||
| 388 | DefaultAttrsIntrinsic< |
||
| 389 | [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 390 | [IntrNoMem, IntrSpeculatable] |
||
| 391 | >; |
||
| 392 | |||
| 393 | def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">, |
||
| 394 | DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 395 | [IntrNoMem, IntrSpeculatable] |
||
| 396 | >; |
||
| 397 | |||
| 398 | def int_amdgcn_class : DefaultAttrsIntrinsic< |
||
| 399 | [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], |
||
| 400 | [IntrNoMem, IntrSpeculatable] |
||
| 401 | >; |
||
| 402 | |||
| 403 | def int_amdgcn_fmed3 : ClangBuiltin<"__builtin_amdgcn_fmed3">, |
||
| 404 | DefaultAttrsIntrinsic<[llvm_anyfloat_ty], |
||
| 405 | [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], |
||
| 406 | [IntrNoMem, IntrSpeculatable] |
||
| 407 | >; |
||
| 408 | |||
| 409 | def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">, |
||
| 410 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 411 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 412 | [IntrNoMem, IntrSpeculatable] |
||
| 413 | >; |
||
| 414 | |||
| 415 | def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">, |
||
| 416 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 417 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 418 | [IntrNoMem, IntrSpeculatable] |
||
| 419 | >; |
||
| 420 | |||
| 421 | def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">, |
||
| 422 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 423 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 424 | [IntrNoMem, IntrSpeculatable] |
||
| 425 | >; |
||
| 426 | |||
| 427 | def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">, |
||
| 428 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 429 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 430 | [IntrNoMem, IntrSpeculatable] |
||
| 431 | >; |
||
| 432 | |||
| 433 | // v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz |
||
| 434 | // should be used. |
||
| 435 | def int_amdgcn_sffbh : |
||
| 436 | DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], |
||
| 437 | [IntrNoMem, IntrSpeculatable] |
||
| 438 | >; |
||
| 439 | |||
| 440 | // v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support. |
||
| 441 | def int_amdgcn_fmad_ftz : |
||
| 442 | DefaultAttrsIntrinsic<[llvm_anyfloat_ty], |
||
| 443 | [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], |
||
| 444 | [IntrNoMem, IntrSpeculatable] |
||
| 445 | >; |
||
| 446 | |||
| 447 | // Fields should mirror atomicrmw |
||
| 448 | class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty], |
||
| 449 | [llvm_anyptr_ty, |
||
| 450 | LLVMMatchType<0>, |
||
| 451 | llvm_i32_ty, // ordering |
||
| 452 | llvm_i32_ty, // scope |
||
| 453 | llvm_i1_ty], // isVolatile |
||
| 454 | [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, |
||
| 455 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], "", |
||
| 456 | [SDNPMemOperand] |
||
| 457 | >; |
||
| 458 | |||
| 459 | def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin; |
||
| 460 | def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin; |
||
| 461 | |||
| 462 | class AMDGPULDSIntrin : |
||
| 463 | Intrinsic<[llvm_any_ty], |
||
| 464 | [LLVMQualPointerType<LLVMMatchType<0>, 3>, |
||
| 465 | LLVMMatchType<0>, |
||
| 466 | llvm_i32_ty, // ordering |
||
| 467 | llvm_i32_ty, // scope |
||
| 468 | llvm_i1_ty], // isVolatile |
||
| 469 | [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, |
||
| 470 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree] |
||
| 471 | >; |
||
| 472 | |||
| 473 | // FIXME: The m0 argument should be moved after the normal arguments |
||
| 474 | class AMDGPUDSOrderedIntrinsic : Intrinsic< |
||
| 475 | [llvm_i32_ty], |
||
| 476 | // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that |
||
| 477 | // the bit packing can be optimized at the IR level. |
||
| 478 | [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0) |
||
| 479 | llvm_i32_ty, // value to add or swap |
||
| 480 | llvm_i32_ty, // ordering |
||
| 481 | llvm_i32_ty, // scope |
||
| 482 | llvm_i1_ty, // isVolatile |
||
| 483 | llvm_i32_ty, // ordered count index (OA index), also added to the address |
||
| 484 | // gfx10: bits 24-27 indicate the number of active threads/dwords |
||
| 485 | llvm_i1_ty, // wave release, usually set to 1 |
||
| 486 | llvm_i1_ty], // wave done, set to 1 for the last ordered instruction |
||
| 487 | [IntrWillReturn, NoCapture<ArgIndex<0>>, |
||
| 488 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, |
||
| 489 | ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree |
||
| 490 | ] |
||
| 491 | >; |
||
| 492 | |||
| 493 | class AMDGPUDSAppendConsumedIntrinsic : Intrinsic< |
||
| 494 | [llvm_i32_ty], |
||
| 495 | [llvm_anyptr_ty, // LDS or GDS ptr |
||
| 496 | llvm_i1_ty], // isVolatile |
||
| 497 | [IntrConvergent, IntrWillReturn, IntrArgMemOnly, |
||
| 498 | NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree], |
||
| 499 | "", |
||
| 500 | [SDNPMemOperand] |
||
| 501 | >; |
||
| 502 | |||
| 503 | def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic; |
||
| 504 | def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; |
||
| 505 | |||
| 506 | // The pointer argument is assumed to be dynamically uniform if a VGPR. |
||
| 507 | def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; |
||
| 508 | def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; |
||
| 509 | |||
| 510 | def int_amdgcn_ds_fadd : AMDGPULDSIntrin; |
||
| 511 | def int_amdgcn_ds_fmin : AMDGPULDSIntrin; |
||
| 512 | def int_amdgcn_ds_fmax : AMDGPULDSIntrin; |
||
| 513 | |||
| 514 | } // TargetPrefix = "amdgcn" |
||
| 515 | |||
| 516 | // New-style image intrinsics |
||
| 517 | |||
| 518 | ////////////////////////////////////////////////////////////////////////// |
||
| 519 | // Dimension-aware image intrinsics framework |
||
| 520 | ////////////////////////////////////////////////////////////////////////// |
||
| 521 | |||
| 522 | // Helper class to represent (type, name) combinations of arguments. The |
||
| 523 | // argument names are explanatory and used as DAG operand names for codegen |
||
| 524 | // pattern matching. |
||
| 525 | class AMDGPUArg<LLVMType ty, string name> { |
||
| 526 | LLVMType Type = ty; |
||
| 527 | string Name = name; |
||
| 528 | } |
||
| 529 | |||
| 530 | // Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...] |
||
| 531 | class makeArgList<list<string> names, LLVMType basety> { |
||
| 532 | list<AMDGPUArg> ret = |
||
| 533 | !listconcat([AMDGPUArg<basety, names[0]>], |
||
| 534 | !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>)); |
||
| 535 | } |
||
| 536 | |||
| 537 | // Return arglist, with LLVMMatchType's references shifted by 'shift'. |
||
| 538 | class arglistmatchshift<list<AMDGPUArg> arglist, int shift> { |
||
| 539 | list<AMDGPUArg> ret = |
||
| 540 | !foreach(arg, arglist, |
||
| 541 | !if(!isa<LLVMMatchType>(arg.Type), |
||
| 542 | AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>, |
||
| 543 | arg.Name>, |
||
| 544 | arg)); |
||
| 545 | } |
||
| 546 | |||
| 547 | // Return the concatenation of the given arglists. LLVMMatchType's are adjusted |
||
| 548 | // accordingly, and shifted by an additional 'shift'. |
||
| 549 | class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> { |
||
| 550 | list<AMDGPUArg> ret = |
||
| 551 | !foldl([]<AMDGPUArg>, arglists, lhs, rhs, |
||
| 552 | !listconcat( |
||
| 553 | lhs, |
||
| 554 | arglistmatchshift<rhs, |
||
| 555 | !add(shift, !foldl(0, lhs, a, b, |
||
| 556 | !add(a, b.Type.isAny)))>.ret)); |
||
| 557 | } |
||
| 558 | |||
| 559 | // Represent texture/image types / dimensionality. |
||
| 560 | class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix, |
||
| 561 | list<string> coord_names, list<string> slice_names, |
||
| 562 | bit msaa = 0> { |
||
| 563 | AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME); |
||
| 564 | string Name = name; // e.g. "2darraymsaa" |
||
| 565 | string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings) |
||
| 566 | bits<3> Encoding = enc; |
||
| 567 | bit DA = 0; // DA bit in MIMG encoding |
||
| 568 | bit MSAA = msaa; |
||
| 569 | |||
| 570 | list<AMDGPUArg> CoordSliceArgs = |
||
| 571 | makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret; |
||
| 572 | list<AMDGPUArg> CoordSliceIntArgs = |
||
| 573 | makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret; |
||
| 574 | list<AMDGPUArg> GradientArgs = |
||
| 575 | makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"), |
||
| 576 | !foreach(name, coord_names, "d" # name # "dv")), |
||
| 577 | llvm_anyfloat_ty>.ret; |
||
| 578 | |||
| 579 | bits<8> NumCoords = !size(CoordSliceArgs); |
||
| 580 | bits<8> NumGradients = !size(GradientArgs); |
||
| 581 | } |
||
| 582 | |||
| 583 | def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>; |
||
| 584 | def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>; |
||
| 585 | def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>; |
||
| 586 | let DA = 1 in { |
||
| 587 | def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>; |
||
| 588 | def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>; |
||
| 589 | def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>; |
||
| 590 | } |
||
| 591 | def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>; |
||
| 592 | let DA = 1 in { |
||
| 593 | def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>; |
||
| 594 | } |
||
| 595 | |||
| 596 | def AMDGPUDims { |
||
| 597 | list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D, |
||
| 598 | AMDGPUDimCube, AMDGPUDim1DArray, |
||
| 599 | AMDGPUDim2DArray]; |
||
| 600 | list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa]; |
||
| 601 | list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa); |
||
| 602 | } |
||
| 603 | |||
| 604 | // Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof. |
||
| 605 | class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> { |
||
| 606 | string UpperCaseMod = ucmod; |
||
| 607 | string LowerCaseMod = lcmod; |
||
| 608 | |||
| 609 | // {offset} {bias} {z-compare} |
||
| 610 | list<AMDGPUArg> ExtraAddrArgs = extra_addr; |
||
| 611 | bit Offset = false; |
||
| 612 | bit Bias = false; |
||
| 613 | bit ZCompare = false; |
||
| 614 | bit Gradients = false; |
||
| 615 | |||
| 616 | // Name of the {lod} or {clamp} argument that is appended to the coordinates, |
||
| 617 | // if any. |
||
| 618 | string LodOrClamp = ""; |
||
| 619 | } |
||
| 620 | |||
| 621 | // AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE |
||
| 622 | // AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4 |
||
// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  // Defines <base> and <base>_o, where _o prepends an i32 "offset" to the
  // extra address arguments.
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  // Adds _c (z-compare) variants on top of the _o variants; the float
  // "zcompare" value is appended after the other extra address arguments.
  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  // Adds _cl (clamp) variants on top of the compare/offset variants.
  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  // Variants without explicit derivatives; per the comment above, these are
  // the only ones IMAGE_GATHER4 supports.
  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  // Derivative variants (_d, _cd) supply explicit gradients and are
  // intentionally outside the NoGradients defset above.
  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
||
| 662 | |||
| 663 | // Helper class to capture the profile of a dimension-aware image intrinsic. |
||
| 664 | // This information is used to generate the intrinsic's type and to inform |
||
| 665 | // codegen pattern matching. |
||
// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  // Number of overloaded ("any") types among the return and data arguments.
  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  // Full vaddr argument list: extra args (offset/bias/zcompare), then
  // gradients (if enabled), then coordinates/slice, then the trailing
  // lod/clamp/mip argument (if any). NOTE(review): arglistconcat is declared
  // earlier in this file; it presumably rebases LLVMMatchType indices by
  // NumRetAndDataAnyTypes — confirm against its definition.
  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  // AddrArgs with overloaded/match types resolved to the default width:
  // f32 for sample ops, i32 otherwise.
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  // Same, but for 16-bit (A16) addressing: f16 for sample ops, i16 otherwise.
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}
||
| 707 | |||
// Clones every overridable field of an existing profile, so a derived class
// can change individual fields without restating the whole profile.
class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}
||
| 720 | |||
// Profile for IMAGE_SAMPLE*: overloaded return type, with the extra address
// arguments and feature flags copied from the sample variant.
class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}
||
| 733 | |||
// Profile for non-sample ops (loads/stores): caller supplies return and data
// argument lists; Mip = true appends a "mip" level to the address arguments.
class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}
||
| 743 | |||
// Profile for integer image atomics: the overloaded integer return type also
// types the vdata operand(s) via LLVMMatchType in dataargs.
class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

// Float-typed atomics (fmin/fmax): identical except for the float overload.
class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}
||
| 757 | |||
// Profile for GET_RESINFO: overloaded float return; the address list is
// overridden to a single overloaded-int "mip" argument (bypassing the
// coordinate-based AddrArgs computed by the base class).
class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}
||
| 765 | |||
| 766 | // Helper class for figuring out image intrinsic argument indexes. |
||
// Helper class for figuring out image intrinsic argument indexes.
// Operand order (see AMDGPUImageDimIntrinsic below): data..., [dmask],
// vaddr... (offset, bias, zcompare, gradients, coords, lod/clamp/mip),
// rsrc, [samp, unorm], texfailctrl, cachepolicy.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  // Counts of each operand group.
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic);  // atomics carry no dmask operand
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);  // sampler descriptor + unorm bit
  // Cumulative operand indexes derived from the counts above.
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);  // last vaddr element
  int MipArgIndex = LodArgIndex;                            // mip occupies the same slot
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}
||
| 794 | |||
| 795 | // All dimension-aware intrinsics are derived from this class. |
||
// All dimension-aware intrinsics are derived from this class.
// Assembles the intrinsic signature from the profile and tags the immediate
// operands (dmask/unorm/texfailctrl/cachepolicy) using the indexes computed
// by AMDGPUImageDimIntrinsicEval.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : DefaultAttrsIntrinsic<
    P_.RetTypes,          // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),  // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),   // dmask(imm)
      P_.AddrTypes,                          // vaddr(VGPR)
      [llvm_v8i32_ty],                       // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,       // samp(SGPR)
                        llvm_i1_ty], []),    // unorm(imm)
      [llvm_i32_ty,                          // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                        // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)

     !listconcat(props,
          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),


      "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  // NOTE(review): self-reference via !cast(NAME); presumably lets generated
  // tables map a record back to the full intrinsic — confirm against users
  // of the Intr field elsewhere in the backend.
  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}
||
| 826 | |||
// Marker class for intrinsics with a DMask that determines the returned
// channels. Carries no members; mixed in alongside AMDGPUImageDimIntrinsic
// in the defs below.
class AMDGPUImageDMaskIntrinsic;
||
| 830 | |||
| 831 | defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = { |
||
| 832 | |||
| 833 | ////////////////////////////////////////////////////////////////////////// |
||
| 834 | // Load and store intrinsics |
||
| 835 | ////////////////////////////////////////////////////////////////////////// |
||
  // Instantiates NAME_<dim> for every non-MSAA dimension
  // (1d, 2d, 3d, cube, 1darray, 2darray).
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
          : AMDGPUImageDimIntrinsic<
              AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
              props, sdnodeprops>;
    }
  }
||
| 849 | |||
| 850 | multiclass AMDGPUImageDimIntrinsicsAll<string opmod, |
||
| 851 | list<LLVMType> retty, |
||
| 852 | list<AMDGPUArg> dataargs, |
||
| 853 | list<IntrinsicProperty> props, |
||
| 854 | list<SDNodeProperty> sdnodeprops, |
||
| 855 | bit Mip = false> { |
||
| 856 | foreach dim = AMDGPUDims.All in { |
||
| 857 | def !strconcat(NAME, "_", dim.Name) |
||
| 858 | : AMDGPUImageDimIntrinsic< |
||
| 859 | AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>, |
||
| 860 | props, sdnodeprops>; |
||
| 861 | } |
||
| 862 | } |
||
| 863 | |||
| 864 | defm int_amdgcn_image_load |
||
| 865 | : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem], |
||
| 866 | [SDNPMemOperand]>, |
||
| 867 | AMDGPUImageDMaskIntrinsic; |
||
| 868 | defm int_amdgcn_image_load_mip |
||
| 869 | : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [], |
||
| 870 | [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>, |
||
| 871 | AMDGPUImageDMaskIntrinsic; |
||
| 872 | |||
| 873 | defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll< |
||
| 874 | "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], |
||
| 875 | [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>; |
||
| 876 | defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa< |
||
| 877 | "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">], |
||
| 878 | [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>; |
||
| 879 | |||
| 880 | ////////////////////////////////////////////////////////////////////////// |
||
| 881 | // MSAA intrinsics |
||
| 882 | ////////////////////////////////////////////////////////////////////////// |
||
  // MSAA_LOAD_X form, instantiated for the two MSAA dimensions only.
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  // Plain MSAA_LOAD form, same dimensions.
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }
||
| 896 | |||
| 897 | ////////////////////////////////////////////////////////////////////////// |
||
| 898 | // sample and getlod intrinsics |
||
| 899 | ////////////////////////////////////////////////////////////////////////// |
||
  // Instantiates a sample-style intrinsic per non-MSAA dimension.
  // NoMem = true makes the intrinsic pure (used by getlod below): IntrNoMem
  // and no memory operand instead of IntrReadMem + SDNPMemOperand.
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }
||
| 910 | |||
  // One IMAGE_SAMPLE* intrinsic family per sample variant (_c/_o/_b/_l/...).
  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
        : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
          AMDGPUImageDMaskIntrinsic;
  }

  // getlod reuses the basic sample profile but is pure (NoMem = 1 selects
  // IntrNoMem in AMDGPUImageDimSampleDims).
  defm int_amdgcn_image_getlod
      : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
        AMDGPUImageDMaskIntrinsic;
||
| 920 | |||
| 921 | ////////////////////////////////////////////////////////////////////////// |
||
| 922 | // getresinfo intrinsics |
||
| 923 | ////////////////////////////////////////////////////////////////////////// |
||
  // Resource-info query per dimension; pure (IntrNoMem) and therefore no
  // SDNPMemOperand.
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
        : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
          AMDGPUImageDMaskIntrinsic;
  }
||
| 929 | |||
| 930 | ////////////////////////////////////////////////////////////////////////// |
||
| 931 | // gather4 intrinsics |
||
| 932 | ////////////////////////////////////////////////////////////////////////// |
||
  // gather4 supports only the gradient-free sample variants and only the
  // 2d, cube and 2darray dimensions.
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
||
| 941 | } |
||
| 942 | |||
| 943 | ////////////////////////////////////////////////////////////////////////// |
||
| 944 | // atomic intrinsics |
||
| 945 | ////////////////////////////////////////////////////////////////////////// |
||
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  // One def per dimension; isFloat selects the float-overloaded profile.
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
          !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
               AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
          [], [SDNPMemOperand]>;
    }
  }

  // Common single-operand atomics: vdata matches the overloaded return type.
  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  // cmpswap takes two data operands (src, cmp) of the overloaded type.
  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}
||
| 986 | |||
| 987 | ////////////////////////////////////////////////////////////////////////// |
||
| 988 | // Buffer intrinsics |
||
| 989 | ////////////////////////////////////////////////////////////////////////// |
||
| 990 | |||
| 991 | let TargetPrefix = "amdgcn" in { |
||
| 992 | |||
| 993 | defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = { |
||
| 994 | |||
// Original-style buffer load (superseded by the raw/struct variants below;
// see the comment preceding AMDGPURawBufferLoad).
class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;
||
| 1006 | |||
| 1007 | // Generate a buffer_load instruction that may be optimized to s_buffer_load if |
||
| 1008 | // the offset argument is uniform. |
||
// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform. Modeled as IntrNoMem: the descriptor-based
// read is treated as invariant.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;
||
| 1016 | |||
// Original-style buffer store, counterpart of AMDGPUBufferLoad above.
class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;
||
| 1029 | |||
| 1030 | // New buffer intrinsics with separate raw and struct variants. The raw |
||
| 1031 | // variant never has an index. The struct variant always has an index, even if |
||
| 1032 | // it is const 0. A struct intrinsic with constant 0 index is different to the |
||
| 1033 | // corresponding raw intrinsic on gfx9+ because the behavior of bound checking |
||
| 1034 | // and swizzling changes depending on whether idxen is set in the instruction. |
||
| 1035 | // These new instrinsics also keep the offset and soffset arguments separate as |
||
| 1036 | // they behave differently in bounds checking and swizzling. |
||
// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different to the
// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new instrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
||
| 1050 | |||
// Struct variant: same as AMDGPURawBufferLoad plus a leading vindex operand.
class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
// NOTE(review): unlike int_amdgcn_raw_buffer_load_format above, the struct
// format load keeps the default llvm_any_ty overload — confirm intentional.
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
||
| 1065 | |||
// Raw-variant store, counterpart of AMDGPURawBufferLoad.
class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;
||
| 1080 | |||
// Struct-variant store: AMDGPURawBufferStore plus a vindex operand.
class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
||
| 1096 | |||
// Raw buffer atomics. NOTE(review): uses plain Intrinsic (not
// DefaultAttrsIntrinsic) and spells out willreturn/nocallback/nofree
// explicitly — presumably to avoid picking up other default attributes;
// confirm before converting. NoRtn = true drops the return value and types
// vdata directly with data_ty instead of LLVMMatchType.
class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
// cmpswap has an extra cmp operand, so it is spelled out rather than using
// the class above; cachepolicy moves to index 5 accordingly.
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
||
| 1133 | |||
// Struct buffer atomics: AMDGPURawBufferAtomic plus a vindex operand (see
// the raw class above for the NoRtn and attribute notes).
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
// cmpswap: extra cmp operand, cachepolicy at index 6.
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
||
| 1174 | |||
| 1175 | |||
| 1176 | // Obsolescent tbuffer intrinsics. |
||
// Typed-buffer load with split dfmt/nfmt fields (see "Obsolescent" note
// above; the raw/struct variants below merge them into one format operand).
def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],      // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // voffset(VGPR)
   llvm_i32_ty,       // soffset(SGPR)
   llvm_i32_ty,       // offset(imm)
   llvm_i32_ty,       // dfmt(imm)
   llvm_i32_ty,       // nfmt(imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
||
| 1192 | |||
// Typed-buffer store, counterpart of int_amdgcn_tbuffer_load above
// (immediate operand indexes shift by one for the leading vdata).
def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic <
  [],
  [llvm_any_ty,       // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // voffset(VGPR)
   llvm_i32_ty,       // soffset(SGPR)
   llvm_i32_ty,       // offset(imm)
   llvm_i32_ty,       // dfmt(imm)
   llvm_i32_ty,       // nfmt(imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
   ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
||
| 1209 | |||
| 1210 | // New tbuffer intrinsics, with: |
||
| 1211 | // - raw and struct variants |
||
| 1212 | // - joint format field |
||
| 1213 | // - joint cachepolicy field |
||
// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],      // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrReadMem,
   ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
||
| 1227 | |||
// Raw tbuffer store, counterpart of int_amdgcn_raw_tbuffer_load above.
def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
  [],
  [llvm_any_ty,       // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                       swizzled buffer (bit 3 = swz))
  [IntrWriteMem,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
||
| 1242 | |||
| 1243 | def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < |
||
| 1244 | [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 |
||
| 1245 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1246 | llvm_i32_ty, // vindex(VGPR) |
||
| 1247 | llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) |
||
| 1248 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
| 1249 | llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) |
||
| 1250 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
| 1251 | // bit 1 = slc, |
||
| 1252 | // bit 2 = dlc on gfx10+), |
||
| 1253 | // swizzled buffer (bit 3 = swz)) |
||
| 1254 | [IntrReadMem, |
||
| 1255 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, |
||
| 1256 | AMDGPURsrcIntrinsic<0>; |
||
| 1257 | |||
| 1258 | def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < |
||
| 1259 | [], |
||
| 1260 | [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 |
||
| 1261 | llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1262 | llvm_i32_ty, // vindex(VGPR) |
||
| 1263 | llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) |
||
| 1264 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
| 1265 | llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) |
||
| 1266 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
| 1267 | // bit 1 = slc, |
||
| 1268 | // bit 2 = dlc on gfx10+), |
||
| 1269 | // swizzled buffer (bit 3 = swz)) |
||
| 1270 | [IntrWriteMem, |
||
| 1271 | ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, |
||
| 1272 | AMDGPURsrcIntrinsic<1>; |
||
| 1273 | |||
| 1274 | class AMDGPUBufferAtomic : Intrinsic < |
||
| 1275 | [llvm_anyint_ty], |
||
| 1276 | [LLVMMatchType<0>, // vdata(VGPR) |
||
| 1277 | llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1278 | llvm_i32_ty, // vindex(VGPR) |
||
| 1279 | llvm_i32_ty, // offset(SGPR/VGPR/imm) |
||
| 1280 | llvm_i1_ty], // slc(imm) |
||
| 1281 | [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, |
||
| 1282 | AMDGPURsrcIntrinsic<1, 0>; |
||
| 1283 | def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; |
||
| 1284 | def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; |
||
| 1285 | def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic; |
||
| 1286 | def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic; |
||
| 1287 | def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic; |
||
| 1288 | def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic; |
||
| 1289 | def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic; |
||
| 1290 | def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic; |
||
| 1291 | def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic; |
||
| 1292 | def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic; |
||
| 1293 | def int_amdgcn_buffer_atomic_cmpswap : Intrinsic< |
||
| 1294 | [llvm_i32_ty], |
||
| 1295 | [llvm_i32_ty, // src(VGPR) |
||
| 1296 | llvm_i32_ty, // cmp(VGPR) |
||
| 1297 | llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1298 | llvm_i32_ty, // vindex(VGPR) |
||
| 1299 | llvm_i32_ty, // offset(SGPR/VGPR/imm) |
||
| 1300 | llvm_i1_ty], // slc(imm) |
||
| 1301 | [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, |
||
| 1302 | AMDGPURsrcIntrinsic<2, 0>; |
||
| 1303 | |||
| 1304 | def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic; |
||
| 1305 | |||
| 1306 | class AMDGPUBufferAtomicFP : Intrinsic < |
||
| 1307 | [llvm_anyfloat_ty], |
||
| 1308 | [LLVMMatchType<0>, // vdata(VGPR) |
||
| 1309 | llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1310 | llvm_i32_ty, // vindex(VGPR) |
||
| 1311 | llvm_i32_ty, // offset(SGPR/VGPR/imm) |
||
| 1312 | llvm_i1_ty], // slc(imm) |
||
| 1313 | [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, |
||
| 1314 | AMDGPURsrcIntrinsic<1, 0>; |
||
| 1315 | |||
| 1316 | // Legacy form of the intrinsic. The raw and struct forms should be preferred. |
||
| 1317 | def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP; |
||
| 1318 | |||
| 1319 | class AMDGPURawBufferLoadLDS : Intrinsic < |
||
| 1320 | [], |
||
| 1321 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1322 | LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset |
||
| 1323 | llvm_i32_ty, // Data byte size: 1/2/4 |
||
| 1324 | llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) |
||
| 1325 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
| 1326 | llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) |
||
| 1327 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
| 1328 | // bit 1 = slc, |
||
| 1329 | // bit 2 = dlc on gfx10+)) |
||
| 1330 | // swizzled buffer (bit 3 = swz)) |
||
| 1331 | [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, |
||
| 1332 | ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; |
||
| 1333 | def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; |
||
| 1334 | |||
| 1335 | class AMDGPUStructBufferLoadLDS : Intrinsic < |
||
| 1336 | [], |
||
| 1337 | [llvm_v4i32_ty, // rsrc(SGPR) |
||
| 1338 | LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset |
||
| 1339 | llvm_i32_ty, // Data byte size: 1/2/4 |
||
| 1340 | llvm_i32_ty, // vindex(VGPR) |
||
| 1341 | llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) |
||
| 1342 | llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) |
||
| 1343 | llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) |
||
| 1344 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, |
||
| 1345 | // bit 1 = slc, |
||
| 1346 | // bit 2 = dlc on gfx10+)) |
||
| 1347 | // swizzled buffer (bit 3 = swz)) |
||
| 1348 | [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, |
||
| 1349 | ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; |
||
| 1350 | def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; |
||
| 1351 | |||
| 1352 | } // defset AMDGPUBufferIntrinsics |
||
| 1353 | |||
| 1354 | // Uses that do not set the done bit should set IntrWriteMem on the |
||
| 1355 | // call site. |
||
| 1356 | def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [ |
||
| 1357 | llvm_i32_ty, // tgt, |
||
| 1358 | llvm_i32_ty, // en |
||
| 1359 | llvm_any_ty, // src0 (f32 or i32) |
||
| 1360 | LLVMMatchType<0>, // src1 |
||
| 1361 | LLVMMatchType<0>, // src2 |
||
| 1362 | LLVMMatchType<0>, // src3 |
||
| 1363 | llvm_i1_ty, // done |
||
| 1364 | llvm_i1_ty // vm (ignored on GFX11+) |
||
| 1365 | ], |
||
| 1366 | [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, |
||
| 1367 | ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly] |
||
| 1368 | >; |
||
| 1369 | |||
| 1370 | // exp with row_en bit set. Only supported on GFX11+. |
||
| 1371 | def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [ |
||
| 1372 | llvm_i32_ty, // tgt, |
||
| 1373 | llvm_i32_ty, // en |
||
| 1374 | llvm_any_ty, // src0 (f32 or i32) |
||
| 1375 | LLVMMatchType<0>, // src1 |
||
| 1376 | LLVMMatchType<0>, // src2 |
||
| 1377 | LLVMMatchType<0>, // src3 |
||
| 1378 | llvm_i1_ty, // done |
||
| 1379 | llvm_i32_ty], // row number |
||
| 1380 | [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, |
||
| 1381 | IntrWriteMem, IntrInaccessibleMemOnly] |
||
| 1382 | >; |
||
| 1383 | |||
| 1384 | // exp with compr bit set. Not supported on GFX11+. |
||
| 1385 | def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [ |
||
| 1386 | llvm_i32_ty, // tgt, |
||
| 1387 | llvm_i32_ty, // en |
||
| 1388 | llvm_anyvector_ty, // src0 (v2f16 or v2i16) |
||
| 1389 | LLVMMatchType<0>, // src1 |
||
| 1390 | llvm_i1_ty, // done |
||
| 1391 | llvm_i1_ty], // vm |
||
| 1392 | [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>, |
||
| 1393 | ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly] |
||
| 1394 | >; |
||
| 1395 | |||
| 1396 | def int_amdgcn_buffer_wbinvl1_sc : |
||
| 1397 | ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, |
||
| 1398 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1399 | |||
| 1400 | def int_amdgcn_buffer_wbinvl1 : |
||
| 1401 | ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">, |
||
| 1402 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1403 | |||
| 1404 | def int_amdgcn_s_dcache_inv : |
||
| 1405 | ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">, |
||
| 1406 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1407 | |||
| 1408 | def int_amdgcn_s_memtime : |
||
| 1409 | ClangBuiltin<"__builtin_amdgcn_s_memtime">, |
||
| 1410 | DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1411 | |||
| 1412 | def int_amdgcn_s_sleep : |
||
| 1413 | ClangBuiltin<"__builtin_amdgcn_s_sleep">, |
||
| 1414 | DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1415 | IntrHasSideEffects]> { |
||
| 1416 | } |
||
| 1417 | |||
| 1418 | def int_amdgcn_s_incperflevel : |
||
| 1419 | ClangBuiltin<"__builtin_amdgcn_s_incperflevel">, |
||
| 1420 | DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1421 | IntrHasSideEffects]> { |
||
| 1422 | } |
||
| 1423 | |||
| 1424 | def int_amdgcn_s_decperflevel : |
||
| 1425 | ClangBuiltin<"__builtin_amdgcn_s_decperflevel">, |
||
| 1426 | DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1427 | IntrHasSideEffects]> { |
||
| 1428 | } |
||
| 1429 | |||
| 1430 | def int_amdgcn_s_sethalt : |
||
| 1431 | DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1432 | IntrHasSideEffects]>; |
||
| 1433 | |||
| 1434 | def int_amdgcn_s_setprio : |
||
| 1435 | ClangBuiltin<"__builtin_amdgcn_s_setprio">, |
||
| 1436 | DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, |
||
| 1437 | IntrHasSideEffects]>; |
||
| 1438 | |||
| 1439 | // This is IntrHasSideEffects so it can be used to read cycle counters. |
||
| 1440 | def int_amdgcn_s_getreg : |
||
| 1441 | ClangBuiltin<"__builtin_amdgcn_s_getreg">, |
||
| 1442 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], |
||
| 1443 | [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] |
||
| 1444 | >; |
||
| 1445 | |||
| 1446 | // Note this can be used to set FP environment properties that are |
||
| 1447 | // unsafe to change in non-strictfp functions. The register properties |
||
| 1448 | // available (and value required to access them) may differ per |
||
| 1449 | // subtarget. llvm.amdgcn.s.setreg(hwmode, value) |
||
| 1450 | def int_amdgcn_s_setreg : |
||
| 1451 | ClangBuiltin<"__builtin_amdgcn_s_setreg">, |
||
| 1452 | DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty], |
||
| 1453 | [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] |
||
| 1454 | >; |
||
| 1455 | |||
| 1456 | // int_amdgcn_s_getpc is provided to allow a specific style of position |
||
| 1457 | // independent code to determine the high part of its address when it is |
||
| 1458 | // known (through convention) that the code and any data of interest does |
||
| 1459 | // not cross a 4Gb address boundary. Use for any other purpose may not |
||
| 1460 | // produce the desired results as optimizations may cause code movement, |
||
| 1461 | // especially as we explicitly use IntrNoMem to allow optimizations. |
||
| 1462 | def int_amdgcn_s_getpc : |
||
| 1463 | ClangBuiltin<"__builtin_amdgcn_s_getpc">, |
||
| 1464 | DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, |
||
| 1465 | IntrWillReturn]>; |
||
| 1466 | |||
| 1467 | // __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0> |
||
| 1468 | // param values: 0 = P10, 1 = P20, 2 = P0 |
||
| 1469 | def int_amdgcn_interp_mov : |
||
| 1470 | ClangBuiltin<"__builtin_amdgcn_interp_mov">, |
||
| 1471 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1472 | [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1473 | [IntrNoMem, IntrSpeculatable, |
||
| 1474 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; |
||
| 1475 | |||
| 1476 | // __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0> |
||
| 1477 | // This intrinsic reads from lds, but the memory values are constant, |
||
| 1478 | // so it behaves like IntrNoMem. |
||
| 1479 | def int_amdgcn_interp_p1 : |
||
| 1480 | ClangBuiltin<"__builtin_amdgcn_interp_p1">, |
||
| 1481 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1482 | [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1483 | [IntrNoMem, IntrSpeculatable, |
||
| 1484 | ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; |
||
| 1485 | |||
| 1486 | // __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0> |
||
| 1487 | def int_amdgcn_interp_p2 : |
||
| 1488 | ClangBuiltin<"__builtin_amdgcn_interp_p2">, |
||
| 1489 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1490 | [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1491 | [IntrNoMem, IntrSpeculatable, |
||
| 1492 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; |
||
| 1493 | // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. |
||
| 1494 | |||
| 1495 | // __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0> |
||
| 1496 | // high selects whether high or low 16-bits are loaded from LDS |
||
| 1497 | def int_amdgcn_interp_p1_f16 : |
||
| 1498 | ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">, |
||
| 1499 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1500 | [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], |
||
| 1501 | [IntrNoMem, IntrSpeculatable, |
||
| 1502 | ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; |
||
| 1503 | |||
| 1504 | // __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0> |
||
| 1505 | // high selects whether high or low 16-bits are loaded from LDS |
||
| 1506 | def int_amdgcn_interp_p2_f16 : |
||
| 1507 | ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">, |
||
| 1508 | DefaultAttrsIntrinsic<[llvm_half_ty], |
||
| 1509 | [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], |
||
| 1510 | [IntrNoMem, IntrSpeculatable, |
||
| 1511 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>; |
||
| 1512 | |||
| 1513 | // llvm.amdgcn.lds.direct.load <m0> |
||
| 1514 | // The input argument is m0, which contains a packed combination of address |
||
| 1515 | // offset and flags describing the data type. |
||
| 1516 | def int_amdgcn_lds_direct_load : |
||
| 1517 | DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16 |
||
| 1518 | [llvm_i32_ty], |
||
| 1519 | [IntrReadMem, IntrSpeculatable]>; |
||
| 1520 | |||
| 1521 | // llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0> |
||
| 1522 | // Like interp intrinsics, this reads from lds, but the memory values are constant, |
||
| 1523 | // so it behaves like IntrNoMem. |
||
| 1524 | def int_amdgcn_lds_param_load : |
||
| 1525 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1526 | [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1527 | [IntrNoMem, IntrSpeculatable, |
||
| 1528 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; |
||
| 1529 | |||
| 1530 | // llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0> |
||
| 1531 | def int_amdgcn_interp_inreg_p10 : |
||
| 1532 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1533 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 1534 | [IntrNoMem, IntrSpeculatable]>; |
||
| 1535 | |||
| 1536 | // llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp> |
||
| 1537 | def int_amdgcn_interp_inreg_p2 : |
||
| 1538 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1539 | [llvm_float_ty, llvm_float_ty, llvm_float_ty], |
||
| 1540 | [IntrNoMem, IntrSpeculatable]>; |
||
| 1541 | |||
| 1542 | // llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high> |
||
| 1543 | // high selects whether high or low 16-bits are used for p and p0 operands |
||
| 1544 | def int_amdgcn_interp_inreg_p10_f16: |
||
| 1545 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 1546 | [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], |
||
| 1547 | [IntrNoMem, IntrSpeculatable, |
||
| 1548 | ImmArg<ArgIndex<3>>]>; |
||
| 1549 | |||
| 1550 | // llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high> |
||
| 1551 | // high selects whether high or low 16-bits are used for p operand |
||
| 1552 | def int_amdgcn_interp_inreg_p2_f16 : |
||
| 1553 | DefaultAttrsIntrinsic<[llvm_half_ty], |
||
| 1554 | [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], |
||
| 1555 | [IntrNoMem, IntrSpeculatable, |
||
| 1556 | ImmArg<ArgIndex<3>>]>; |
||
| 1557 | |||
| 1558 | // Deprecated: use llvm.amdgcn.live.mask instead. |
||
| 1559 | def int_amdgcn_ps_live : DefaultAttrsIntrinsic < |
||
| 1560 | [llvm_i1_ty], |
||
| 1561 | [], |
||
| 1562 | [IntrNoMem]>; |
||
| 1563 | |||
| 1564 | // Query currently live lanes. |
||
| 1565 | // Returns true if lane is live (and not a helper lane). |
||
| 1566 | def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty], |
||
| 1567 | [], [IntrReadMem, IntrInaccessibleMemOnly] |
||
| 1568 | >; |
||
| 1569 | |||
| 1570 | def int_amdgcn_mbcnt_lo : |
||
| 1571 | ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">, |
||
| 1572 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1573 | [IntrNoMem]>; |
||
| 1574 | |||
| 1575 | def int_amdgcn_mbcnt_hi : |
||
| 1576 | ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">, |
||
| 1577 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1578 | [IntrNoMem]>; |
||
| 1579 | |||
| 1580 | // llvm.amdgcn.ds.swizzle src offset |
||
| 1581 | def int_amdgcn_ds_swizzle : |
||
| 1582 | ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, |
||
| 1583 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1584 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, |
||
| 1585 | ImmArg<ArgIndex<1>>]>; |
||
| 1586 | |||
| 1587 | def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], |
||
| 1588 | [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], |
||
| 1589 | [IntrNoMem, IntrSpeculatable] |
||
| 1590 | >; |
||
| 1591 | |||
| 1592 | def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], |
||
| 1593 | [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], |
||
| 1594 | [IntrNoMem, IntrSpeculatable] |
||
| 1595 | >; |
||
| 1596 | |||
| 1597 | def int_amdgcn_lerp : |
||
| 1598 | ClangBuiltin<"__builtin_amdgcn_lerp">, |
||
| 1599 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1600 | [IntrNoMem, IntrSpeculatable] |
||
| 1601 | >; |
||
| 1602 | |||
| 1603 | def int_amdgcn_sad_u8 : |
||
| 1604 | ClangBuiltin<"__builtin_amdgcn_sad_u8">, |
||
| 1605 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1606 | [IntrNoMem, IntrSpeculatable] |
||
| 1607 | >; |
||
| 1608 | |||
| 1609 | def int_amdgcn_msad_u8 : |
||
| 1610 | ClangBuiltin<"__builtin_amdgcn_msad_u8">, |
||
| 1611 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1612 | [IntrNoMem, IntrSpeculatable] |
||
| 1613 | >; |
||
| 1614 | |||
| 1615 | def int_amdgcn_sad_hi_u8 : |
||
| 1616 | ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">, |
||
| 1617 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1618 | [IntrNoMem, IntrSpeculatable] |
||
| 1619 | >; |
||
| 1620 | |||
| 1621 | def int_amdgcn_sad_u16 : |
||
| 1622 | ClangBuiltin<"__builtin_amdgcn_sad_u16">, |
||
| 1623 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1624 | [IntrNoMem, IntrSpeculatable] |
||
| 1625 | >; |
||
| 1626 | |||
| 1627 | def int_amdgcn_qsad_pk_u16_u8 : |
||
| 1628 | ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, |
||
| 1629 | DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], |
||
| 1630 | [IntrNoMem, IntrSpeculatable] |
||
| 1631 | >; |
||
| 1632 | |||
| 1633 | def int_amdgcn_mqsad_pk_u16_u8 : |
||
| 1634 | ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, |
||
| 1635 | DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], |
||
| 1636 | [IntrNoMem, IntrSpeculatable] |
||
| 1637 | >; |
||
| 1638 | |||
| 1639 | def int_amdgcn_mqsad_u32_u8 : |
||
| 1640 | ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, |
||
| 1641 | DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], |
||
| 1642 | [IntrNoMem, IntrSpeculatable] |
||
| 1643 | >; |
||
| 1644 | |||
| 1645 | def int_amdgcn_cvt_pk_u8_f32 : |
||
| 1646 | ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, |
||
| 1647 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1648 | [IntrNoMem, IntrSpeculatable] |
||
| 1649 | >; |
||
| 1650 | |||
| 1651 | def int_amdgcn_icmp : |
||
| 1652 | Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty], |
||
| 1653 | [IntrNoMem, IntrConvergent, |
||
| 1654 | ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1655 | |||
| 1656 | def int_amdgcn_fcmp : |
||
| 1657 | Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty], |
||
| 1658 | [IntrNoMem, IntrConvergent, |
||
| 1659 | ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1660 | |||
| 1661 | def int_amdgcn_ballot : |
||
| 1662 | Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], |
||
| 1663 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1664 | |||
| 1665 | def int_amdgcn_readfirstlane : |
||
| 1666 | ClangBuiltin<"__builtin_amdgcn_readfirstlane">, |
||
| 1667 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty], |
||
| 1668 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1669 | |||
| 1670 | // The lane argument must be uniform across the currently active threads of the |
||
| 1671 | // current wave. Otherwise, the result is undefined. |
||
| 1672 | def int_amdgcn_readlane : |
||
| 1673 | ClangBuiltin<"__builtin_amdgcn_readlane">, |
||
| 1674 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1675 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1676 | |||
| 1677 | // The value to write and lane select arguments must be uniform across the |
||
| 1678 | // currently active threads of the current wave. Otherwise, the result is |
||
| 1679 | // undefined. |
||
| 1680 | def int_amdgcn_writelane : |
||
| 1681 | ClangBuiltin<"__builtin_amdgcn_writelane">, |
||
| 1682 | Intrinsic<[llvm_i32_ty], [ |
||
| 1683 | llvm_i32_ty, // uniform value to write: returned by the selected lane |
||
| 1684 | llvm_i32_ty, // uniform lane select |
||
| 1685 | llvm_i32_ty // returned by all lanes other than the selected one |
||
| 1686 | ], |
||
| 1687 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1688 | >; |
||
| 1689 | |||
| 1690 | def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, |
||
| 1691 | DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1692 | [IntrNoMem, IntrSpeculatable] |
||
| 1693 | >; |
||
| 1694 | |||
| 1695 | def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 1696 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1697 | [IntrNoMem, IntrSpeculatable] |
||
| 1698 | >; |
||
| 1699 | |||
| 1700 | def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 1701 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1702 | [IntrNoMem, IntrSpeculatable] |
||
| 1703 | >; |
||
| 1704 | |||
| 1705 | def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 1706 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1707 | [IntrNoMem, IntrSpeculatable] |
||
| 1708 | >; |
||
| 1709 | |||
| 1710 | def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 1711 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1712 | [IntrNoMem, IntrSpeculatable] |
||
| 1713 | >; |
||
| 1714 | |||
| 1715 | // llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) |
||
| 1716 | // |
||
| 1717 | // bar_val is the total number of waves that will wait on this |
||
| 1718 | // barrier, minus 1. |
||
| 1719 | def int_amdgcn_ds_gws_init : |
||
| 1720 | ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, |
||
| 1721 | Intrinsic<[], |
||
| 1722 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1723 | [IntrConvergent, IntrWriteMem, |
||
| 1724 | IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1725 | [SDNPMemOperand] |
||
| 1726 | >; |
||
| 1727 | |||
| 1728 | // llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) |
||
| 1729 | // bar_val is the total number of waves that will wait on this |
||
| 1730 | // barrier, minus 1. |
||
| 1731 | def int_amdgcn_ds_gws_barrier : |
||
| 1732 | ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, |
||
| 1733 | Intrinsic<[], |
||
| 1734 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1735 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1736 | [SDNPMemOperand] |
||
| 1737 | >; |
||
| 1738 | |||
| 1739 | // llvm.amdgcn.ds.gws.sema.v(i32 resource_id) |
||
| 1740 | def int_amdgcn_ds_gws_sema_v : |
||
| 1741 | ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, |
||
| 1742 | Intrinsic<[], |
||
| 1743 | [llvm_i32_ty], |
||
| 1744 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1745 | [SDNPMemOperand] |
||
| 1746 | >; |
||
| 1747 | |||
| 1748 | // llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) |
||
| 1749 | def int_amdgcn_ds_gws_sema_br : |
||
| 1750 | ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, |
||
| 1751 | Intrinsic<[], |
||
| 1752 | [llvm_i32_ty, llvm_i32_ty], |
||
| 1753 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1754 | [SDNPMemOperand] |
||
| 1755 | >; |
||
| 1756 | |||
| 1757 | // llvm.amdgcn.ds.gws.sema.p(i32 resource_id) |
||
| 1758 | def int_amdgcn_ds_gws_sema_p : |
||
| 1759 | ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, |
||
| 1760 | Intrinsic<[], |
||
| 1761 | [llvm_i32_ty], |
||
| 1762 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1763 | [SDNPMemOperand] |
||
| 1764 | >; |
||
| 1765 | |||
| 1766 | // llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) |
||
| 1767 | def int_amdgcn_ds_gws_sema_release_all : |
||
| 1768 | ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, |
||
| 1769 | Intrinsic<[], |
||
| 1770 | [llvm_i32_ty], |
||
| 1771 | [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", |
||
| 1772 | [SDNPMemOperand] |
||
| 1773 | >; |
||
| 1774 | |||
| 1775 | |||
| 1776 | // Copies the source value to the destination value, with the guarantee that |
||
| 1777 | // the source value is computed as if the entire program were executed in WQM. |
||
| 1778 | def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], |
||
| 1779 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1780 | >; |
||
| 1781 | |||
| 1782 | // Copies the source value to the destination value, such that the source |
||
| 1783 | // is computed as if the entire program were executed in WQM if any other |
||
| 1784 | // program code executes in WQM. |
||
| 1785 | def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], |
||
| 1786 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1787 | >; |
||
| 1788 | |||
| 1789 | // Return true if at least one thread within the pixel quad passes true into |
||
| 1790 | // the function. |
||
| 1791 | def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], |
||
| 1792 | [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1793 | >; |
||
| 1794 | |||
| 1795 | // If false, set EXEC=0 for the current thread until the end of program. |
||
| 1796 | // FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? |
||
| 1797 | def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>; |
||
| 1798 | |||
| 1799 | def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, |
||
| 1800 | Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrNoCallback, IntrNoFree] |
||
| 1801 | >; |
||
| 1802 | |||
| 1803 | // If false, mark all active lanes as helper lanes until the end of program. |
||
| 1804 | def int_amdgcn_wqm_demote : Intrinsic<[], |
||
| 1805 | [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree] |
||
| 1806 | >; |
||
| 1807 | |||
| 1808 | // Copies the active channels of the source value to the destination value, |
||
| 1809 | // with the guarantee that the source value is computed as if the entire |
||
| 1810 | // program were executed in Whole Wavefront Mode, i.e. with all channels |
||
| 1811 | // enabled, with a few exceptions; for example, Phi nodes which require WWM return an |
||
| 1812 | // undefined value. |
||
| 1813 | def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], |
||
| 1814 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, |
||
| 1815 | IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1816 | >; |
||
| 1817 | // Deprecated. Use int_amdgcn_strict_wwm instead. |
||
| 1818 | def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], |
||
| 1819 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, |
||
| 1820 | IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1821 | >; |
||
| 1822 | def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty], |
||
| 1823 | [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, |
||
| 1824 | IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 1825 | >; |
||
| 1826 | |||
| 1827 | // Given a value, copies it while setting all the inactive lanes to a given |
||
| 1828 | // value. Note that OpenGL helper lanes are considered active, so if the |
||
| 1829 | // program ever uses WQM, then the instruction and the first source will be |
||
| 1830 | // computed in WQM. |
||
| 1831 | def int_amdgcn_set_inactive : |
||
| 1832 | Intrinsic<[llvm_anyint_ty], |
||
| 1833 | [LLVMMatchType<0>, // value to be copied |
||
| 1834 | LLVMMatchType<0>], // value for the inactive lanes to take |
||
| 1835 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1836 | |||
| 1837 | // Return if the given flat pointer points to a local memory address. |
||
| 1838 | def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, |
||
| 1839 | DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], |
||
| 1840 | [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] |
||
| 1841 | >; |
||
| 1842 | |||
| 1843 | // Return true if the given flat pointer points to a private memory address. |
||
| 1844 | def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">, |
||
| 1845 | DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], |
||
| 1846 | [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] |
||
| 1847 | >; |
||
| 1848 | |||
| 1849 | //===----------------------------------------------------------------------===// |
||
| 1850 | // CI+ Intrinsics |
||
| 1851 | //===----------------------------------------------------------------------===// |
||
| 1852 | |||
| 1853 | def int_amdgcn_s_dcache_inv_vol : |
||
| 1854 | ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, |
||
| 1855 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1856 | |||
| 1857 | def int_amdgcn_buffer_wbinvl1_vol : |
||
| 1858 | ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, |
||
| 1859 | DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; |
||
| 1860 | |||
| 1861 | //===----------------------------------------------------------------------===// |
||
| 1862 | // VI Intrinsics |
||
| 1863 | //===----------------------------------------------------------------------===// |
||
| 1864 | |||
| 1865 | // llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> |
||
| 1866 | def int_amdgcn_mov_dpp : |
||
| 1867 | Intrinsic<[llvm_anyint_ty], |
||
| 1868 | [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, |
||
| 1869 | llvm_i1_ty], |
||
| 1870 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1871 | ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, |
||
| 1872 | ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; |
||
| 1873 | |||
| 1874 | // llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> |
||
| 1875 | // Should be equivalent to: |
||
| 1876 | // v_mov_b32 <dest> <old> |
||
| 1877 | // v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> |
||
| 1878 | def int_amdgcn_update_dpp : |
||
| 1879 | Intrinsic<[llvm_anyint_ty], |
||
| 1880 | [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, |
||
| 1881 | llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], |
||
| 1882 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1883 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, |
||
| 1884 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; |
||
| 1885 | |||
| 1886 | def int_amdgcn_s_dcache_wb : |
||
| 1887 | ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">, |
||
| 1888 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1889 | |||
| 1890 | def int_amdgcn_s_dcache_wb_vol : |
||
| 1891 | ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, |
||
| 1892 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1893 | |||
| 1894 | def int_amdgcn_s_memrealtime : |
||
| 1895 | ClangBuiltin<"__builtin_amdgcn_s_memrealtime">, |
||
| 1896 | Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1897 | |||
| 1898 | // llvm.amdgcn.ds.permute <index> <src> |
||
| 1899 | def int_amdgcn_ds_permute : |
||
| 1900 | ClangBuiltin<"__builtin_amdgcn_ds_permute">, |
||
| 1901 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1902 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1903 | |||
| 1904 | // llvm.amdgcn.ds.bpermute <index> <src> |
||
| 1905 | def int_amdgcn_ds_bpermute : |
||
| 1906 | ClangBuiltin<"__builtin_amdgcn_ds_bpermute">, |
||
| 1907 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1908 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1909 | |||
| 1910 | // llvm.amdgcn.perm <src0> <src1> <selector> |
||
| 1911 | def int_amdgcn_perm : |
||
| 1912 | ClangBuiltin<"__builtin_amdgcn_perm">, |
||
| 1913 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 1914 | [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1915 | |||
| 1916 | //===----------------------------------------------------------------------===// |
||
| 1917 | // GFX9 Intrinsics |
||
| 1918 | //===----------------------------------------------------------------------===// |
||
| 1919 | |||
| 1920 | class AMDGPUGlobalLoadLDS : Intrinsic < |
||
| 1921 | [], |
||
| 1922 | [LLVMQualPointerType<llvm_i8_ty, 1>, // Base global pointer to load from |
||
| 1923 | LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base pointer to store to |
||
| 1924 | llvm_i32_ty, // Data byte size: 1/2/4 |
||
| 1925 | llvm_i32_ty, // imm offset (applied to both global and LDS address) |
||
| 1926 | llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, |
||
| 1927 | // bit 1 = slc/sc1, |
||
| 1928 | // bit 2 = dlc on gfx10+, |
||
| 1929 | // bit 4 = scc/nt on gfx90a+)) |
||
| 1930 | [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, |
||
| 1931 | ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], |
||
| 1932 | "", [SDNPMemOperand]>; |
||
| 1933 | def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; |
||
| 1934 | |||
| 1935 | //===----------------------------------------------------------------------===// |
||
| 1936 | // GFX10 Intrinsics |
||
| 1937 | //===----------------------------------------------------------------------===// |
||
| 1938 | |||
| 1939 | // llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control> |
||
| 1940 | def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, |
||
| 1941 | Intrinsic<[llvm_i32_ty], |
||
| 1942 | [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], |
||
| 1943 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1944 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; |
||
| 1945 | |||
| 1946 | // llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control> |
||
| 1947 | def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, |
||
| 1948 | Intrinsic<[llvm_i32_ty], |
||
| 1949 | [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], |
||
| 1950 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1951 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; |
||
| 1952 | |||
| 1953 | // llvm.amdgcn.mov.dpp8.i32 <src> <sel> |
||
| 1954 | // <sel> is a 32-bit constant whose high 8 bits must be zero which selects |
||
| 1955 | // the lanes to read from. |
||
| 1956 | def int_amdgcn_mov_dpp8 : |
||
| 1957 | Intrinsic<[llvm_anyint_ty], |
||
| 1958 | [LLVMMatchType<0>, llvm_i32_ty], |
||
| 1959 | [IntrNoMem, IntrConvergent, IntrWillReturn, |
||
| 1960 | ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>; |
||
| 1961 | |||
| 1962 | def int_amdgcn_s_get_waveid_in_workgroup : |
||
| 1963 | ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, |
||
| 1964 | Intrinsic<[llvm_i32_ty], [], |
||
| 1965 | [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1966 | |||
| 1967 | class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic < |
||
| 1968 | [vt], |
||
| 1969 | [llvm_anyptr_ty, // vaddr |
||
| 1970 | vt], // vdata(VGPR) |
||
| 1971 | [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", |
||
| 1972 | [SDNPMemOperand]>; |
||
| 1973 | |||
| 1974 | def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>; |
||
| 1975 | |||
| 1976 | // uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>, |
||
| 1977 | // <ray_dir>, <ray_inv_dir>, <texture_descr> |
||
| 1978 | // <node_ptr> is i32 or i64. |
||
| 1979 | // <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32. |
||
| 1980 | def int_amdgcn_image_bvh_intersect_ray : |
||
| 1981 | DefaultAttrsIntrinsic<[llvm_v4i32_ty], |
||
| 1982 | [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty, |
||
| 1983 | LLVMMatchType<1>, llvm_v4i32_ty], |
||
| 1984 | [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1985 | |||
| 1986 | //===----------------------------------------------------------------------===// |
||
| 1987 | // GFX11 Intrinsics |
||
| 1988 | //===----------------------------------------------------------------------===// |
||
| 1989 | |||
| 1990 | // llvm.amdgcn.permlane64 <src0> |
||
| 1991 | def int_amdgcn_permlane64 : |
||
| 1992 | ClangBuiltin<"__builtin_amdgcn_permlane64">, |
||
| 1993 | Intrinsic<[llvm_i32_ty], [llvm_i32_ty], |
||
| 1994 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 1995 | |||
| 1996 | def int_amdgcn_ds_add_gs_reg_rtn : |
||
| 1997 | ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, |
||
| 1998 | Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 1999 | [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 2000 | |||
| 2001 | def int_amdgcn_ds_sub_gs_reg_rtn : |
||
| 2002 | ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, |
||
| 2003 | Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], |
||
| 2004 | [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 2005 | |||
| 2006 | def int_amdgcn_ds_bvh_stack_rtn : |
||
| 2007 | Intrinsic< |
||
| 2008 | [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr |
||
| 2009 | [ |
||
| 2010 | llvm_i32_ty, // %addr |
||
| 2011 | llvm_i32_ty, // %data0 |
||
| 2012 | llvm_v4i32_ty, // %data1 |
||
| 2013 | llvm_i32_ty, // %offset |
||
| 2014 | ], |
||
| 2015 | [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2016 | >; |
||
| 2017 | |||
| 2018 | // WMMA (Wave Matrix Multiply-Accumulate) intrinsics |
||
| 2019 | // |
||
| 2020 | // These operations perform a matrix multiplication and accumulation of |
||
| 2021 | // the form: D = A * B + C . |
||
| 2022 | |||
| 2023 | class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> : |
||
| 2024 | Intrinsic< |
||
| 2025 | [CD], // %D |
||
| 2026 | [ |
||
| 2027 | AB, // %A |
||
| 2028 | AB, // %B |
||
| 2029 | LLVMMatchType<0>, // %C |
||
| 2030 | ], |
||
| 2031 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2032 | >; |
||
| 2033 | |||
| 2034 | class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> : |
||
| 2035 | Intrinsic< |
||
| 2036 | [CD], // %D |
||
| 2037 | [ |
||
| 2038 | AB, // %A |
||
| 2039 | AB, // %B |
||
| 2040 | LLVMMatchType<0>, // %C |
||
| 2041 | llvm_i1_ty, // %high |
||
| 2042 | ], |
||
| 2043 | [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2044 | >; |
||
| 2045 | |||
| 2046 | class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> : |
||
| 2047 | Intrinsic< |
||
| 2048 | [CD], // %D |
||
| 2049 | [ |
||
| 2050 | llvm_i1_ty, // %A_sign |
||
| 2051 | AB, // %A |
||
| 2052 | llvm_i1_ty, // %B_sign |
||
| 2053 | AB, // %B |
||
| 2054 | LLVMMatchType<0>, // %C |
||
| 2055 | llvm_i1_ty, // %clamp |
||
| 2056 | ], |
||
| 2057 | [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2058 | >; |
||
| 2059 | |||
| 2060 | def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>; |
||
| 2061 | def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>; |
||
| 2062 | def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>; |
||
| 2063 | def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>; |
||
| 2064 | def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>; |
||
| 2065 | def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>; |
||
| 2066 | |||
| 2067 | def int_amdgcn_s_wait_event_export_ready : |
||
| 2068 | ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, |
||
| 2069 | Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] |
||
| 2070 | >; |
||
| 2071 | |||
| 2072 | //===----------------------------------------------------------------------===// |
||
| 2073 | // Deep learning intrinsics. |
||
| 2074 | //===----------------------------------------------------------------------===// |
||
| 2075 | |||
| 2076 | // f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) |
||
| 2077 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2078 | def int_amdgcn_fdot2 : |
||
| 2079 | ClangBuiltin<"__builtin_amdgcn_fdot2">, |
||
| 2080 | DefaultAttrsIntrinsic< |
||
| 2081 | [llvm_float_ty], // %r |
||
| 2082 | [ |
||
| 2083 | llvm_v2f16_ty, // %a |
||
| 2084 | llvm_v2f16_ty, // %b |
||
| 2085 | llvm_float_ty, // %c |
||
| 2086 | llvm_i1_ty // %clamp |
||
| 2087 | ], |
||
| 2088 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2089 | >; |
||
| 2090 | |||
| 2091 | // f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) |
||
| 2092 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2093 | def int_amdgcn_fdot2_f16_f16 : |
||
| 2094 | ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, |
||
| 2095 | DefaultAttrsIntrinsic< |
||
| 2096 | [llvm_half_ty], // %r |
||
| 2097 | [ |
||
| 2098 | llvm_v2f16_ty, // %a |
||
| 2099 | llvm_v2f16_ty, // %b |
||
| 2100 | llvm_half_ty // %c |
||
| 2101 | ], |
||
| 2102 | [IntrNoMem, IntrSpeculatable] |
||
| 2103 | >; |
||
| 2104 | |||
| 2105 | // bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) |
||
| 2106 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2107 | def int_amdgcn_fdot2_bf16_bf16 : |
||
| 2108 | ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, |
||
| 2109 | DefaultAttrsIntrinsic< |
||
| 2110 | [llvm_i16_ty], // %r |
||
| 2111 | [ |
||
| 2112 | llvm_v2i16_ty, // %a |
||
| 2113 | llvm_v2i16_ty, // %b |
||
| 2114 | llvm_i16_ty // %c |
||
| 2115 | ], |
||
| 2116 | [IntrNoMem, IntrSpeculatable] |
||
| 2117 | >; |
||
| 2118 | |||
| 2119 | // f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) |
||
| 2120 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2121 | def int_amdgcn_fdot2_f32_bf16 : |
||
| 2122 | ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, |
||
| 2123 | DefaultAttrsIntrinsic< |
||
| 2124 | [llvm_float_ty], // %r |
||
| 2125 | [ |
||
| 2126 | llvm_v2i16_ty, // %a |
||
| 2127 | llvm_v2i16_ty, // %b |
||
| 2128 | llvm_float_ty, // %c |
||
| 2129 | llvm_i1_ty // %clamp |
||
| 2130 | ], |
||
| 2131 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2132 | >; |
||
| 2133 | |||
| 2134 | // i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) |
||
| 2135 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2136 | def int_amdgcn_sdot2 : |
||
| 2137 | ClangBuiltin<"__builtin_amdgcn_sdot2">, |
||
| 2138 | DefaultAttrsIntrinsic< |
||
| 2139 | [llvm_i32_ty], // %r |
||
| 2140 | [ |
||
| 2141 | llvm_v2i16_ty, // %a |
||
| 2142 | llvm_v2i16_ty, // %b |
||
| 2143 | llvm_i32_ty, // %c |
||
| 2144 | llvm_i1_ty // %clamp |
||
| 2145 | ], |
||
| 2146 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2147 | >; |
||
| 2148 | |||
| 2149 | // u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) |
||
| 2150 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c |
||
| 2151 | def int_amdgcn_udot2 : |
||
| 2152 | ClangBuiltin<"__builtin_amdgcn_udot2">, |
||
| 2153 | DefaultAttrsIntrinsic< |
||
| 2154 | [llvm_i32_ty], // %r |
||
| 2155 | [ |
||
| 2156 | llvm_v2i16_ty, // %a |
||
| 2157 | llvm_v2i16_ty, // %b |
||
| 2158 | llvm_i32_ty, // %c |
||
| 2159 | llvm_i1_ty // %clamp |
||
| 2160 | ], |
||
| 2161 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2162 | >; |
||
| 2163 | |||
| 2164 | // i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) |
||
| 2165 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c |
||
| 2166 | def int_amdgcn_sdot4 : |
||
| 2167 | ClangBuiltin<"__builtin_amdgcn_sdot4">, |
||
| 2168 | DefaultAttrsIntrinsic< |
||
| 2169 | [llvm_i32_ty], // %r |
||
| 2170 | [ |
||
| 2171 | llvm_i32_ty, // %a |
||
| 2172 | llvm_i32_ty, // %b |
||
| 2173 | llvm_i32_ty, // %c |
||
| 2174 | llvm_i1_ty // %clamp |
||
| 2175 | ], |
||
| 2176 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2177 | >; |
||
| 2178 | |||
| 2179 | // u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) |
||
| 2180 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c |
||
| 2181 | def int_amdgcn_udot4 : |
||
| 2182 | ClangBuiltin<"__builtin_amdgcn_udot4">, |
||
| 2183 | DefaultAttrsIntrinsic< |
||
| 2184 | [llvm_i32_ty], // %r |
||
| 2185 | [ |
||
| 2186 | llvm_i32_ty, // %a |
||
| 2187 | llvm_i32_ty, // %b |
||
| 2188 | llvm_i32_ty, // %c |
||
| 2189 | llvm_i1_ty // %clamp |
||
| 2190 | ], |
||
| 2191 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2192 | >; |
||
| 2193 | |||
| 2194 | // i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) |
||
| 2195 | // Treat input as signed (_sign = 1) or unsigned (_sign = 0). |
||
| 2196 | // a[i in 0...3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i])); |
||
| 2197 | // b[i in 0...3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); |
||
| 2198 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c |
||
| 2199 | def int_amdgcn_sudot4 : |
||
| 2200 | ClangBuiltin<"__builtin_amdgcn_sudot4">, |
||
| 2201 | DefaultAttrsIntrinsic< |
||
| 2202 | [llvm_i32_ty], // %r |
||
| 2203 | [ |
||
| 2204 | llvm_i1_ty, // %a_sign |
||
| 2205 | llvm_i32_ty, // %a |
||
| 2206 | llvm_i1_ty, // %b_sign |
||
| 2207 | llvm_i32_ty, // %b |
||
| 2208 | llvm_i32_ty, // %c |
||
| 2209 | llvm_i1_ty // %clamp |
||
| 2210 | ], |
||
| 2211 | [IntrNoMem, IntrSpeculatable, |
||
| 2212 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] |
||
| 2213 | >; |
||
| 2214 | |||
| 2215 | // i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) |
||
| 2216 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
| 2217 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
| 2218 | def int_amdgcn_sdot8 : |
||
| 2219 | ClangBuiltin<"__builtin_amdgcn_sdot8">, |
||
| 2220 | DefaultAttrsIntrinsic< |
||
| 2221 | [llvm_i32_ty], // %r |
||
| 2222 | [ |
||
| 2223 | llvm_i32_ty, // %a |
||
| 2224 | llvm_i32_ty, // %b |
||
| 2225 | llvm_i32_ty, // %c |
||
| 2226 | llvm_i1_ty // %clamp |
||
| 2227 | ], |
||
| 2228 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2229 | >; |
||
| 2230 | |||
| 2231 | // u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp) |
||
| 2232 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
| 2233 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
| 2234 | def int_amdgcn_udot8 : |
||
| 2235 | ClangBuiltin<"__builtin_amdgcn_udot8">, |
||
| 2236 | DefaultAttrsIntrinsic< |
||
| 2237 | [llvm_i32_ty], // %r |
||
| 2238 | [ |
||
| 2239 | llvm_i32_ty, // %a |
||
| 2240 | llvm_i32_ty, // %b |
||
| 2241 | llvm_i32_ty, // %c |
||
| 2242 | llvm_i1_ty // %clamp |
||
| 2243 | ], |
||
| 2244 | [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] |
||
| 2245 | >; |
||
| 2246 | |||
| 2247 | // i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) |
||
| 2248 | // Treat input as signed (_sign = 1) or unsigned (_sign = 0). |
||
| 2249 | // a[i in 0...7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); |
||
| 2250 | // b[i in 0...7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i])); |
||
| 2251 | // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + |
||
| 2252 | // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c |
||
| 2253 | def int_amdgcn_sudot8 : |
||
| 2254 | ClangBuiltin<"__builtin_amdgcn_sudot8">, |
||
| 2255 | DefaultAttrsIntrinsic< |
||
| 2256 | [llvm_i32_ty], // %r |
||
| 2257 | [ |
||
| 2258 | llvm_i1_ty, // %a_sign |
||
| 2259 | llvm_i32_ty, // %a |
||
| 2260 | llvm_i1_ty, // %b_sign |
||
| 2261 | llvm_i32_ty, // %b |
||
| 2262 | llvm_i32_ty, // %c |
||
| 2263 | llvm_i1_ty // %clamp |
||
| 2264 | ], |
||
| 2265 | [IntrNoMem, IntrSpeculatable, |
||
| 2266 | ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] |
||
| 2267 | >; |
||
| 2268 | |||
| 2269 | //===----------------------------------------------------------------------===// |
||
| 2270 | // gfx908 intrinsics |
||
| 2271 | //===----------------------------------------------------------------------===// |
||
| 2272 | |||
| 2273 | def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2274 | |||
| 2275 | // llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp |
||
| 2276 | class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> : |
||
| 2277 | ClangBuiltin<!subst("int", "__builtin", NAME)>, |
||
| 2278 | DefaultAttrsIntrinsic<[DestTy], |
||
| 2279 | [SrcABTy, SrcABTy, DestTy, |
||
| 2280 | llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 2281 | [IntrConvergent, IntrNoMem, |
||
| 2282 | ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; |
||
| 2283 | |||
| 2284 | def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>; |
||
| 2285 | def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; |
||
| 2286 | def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; |
||
| 2287 | def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; |
||
| 2288 | def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; |
||
| 2289 | def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>; |
||
| 2290 | def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; |
||
| 2291 | def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; |
||
| 2292 | def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; |
||
| 2293 | def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; |
||
| 2294 | def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>; |
||
| 2295 | def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; |
||
| 2296 | def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; |
||
| 2297 | def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; |
||
| 2298 | def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; |
||
| 2299 | def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>; |
||
| 2300 | def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; |
||
| 2301 | def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; |
||
| 2302 | def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; |
||
| 2303 | def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; |
||
| 2304 | |||
| 2305 | //===----------------------------------------------------------------------===// |
||
| 2306 | // gfx90a intrinsics |
||
| 2307 | //===----------------------------------------------------------------------===// |
||
| 2308 | |||
| 2309 | def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2310 | def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2311 | def int_amdgcn_flat_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2312 | def int_amdgcn_flat_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2313 | def int_amdgcn_flat_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; |
||
| 2314 | |||
| 2315 | def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>; |
||
| 2316 | def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; |
||
| 2317 | def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; |
||
| 2318 | def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; |
||
| 2319 | def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>; |
||
| 2320 | |||
| 2321 | // Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. |
||
| 2322 | // Three bits corresponding to the neg modifier applied to the respective |
||
| 2323 | // source operand. |
||
| 2324 | def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>; |
||
| 2325 | def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>; |
||
| 2326 | |||
| 2327 | //===----------------------------------------------------------------------===// |
||
| 2328 | // gfx940 intrinsics |
||
| 2329 | //===----------------------------------------------------------------------===// |
||
| 2330 | |||
| 2331 | // bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. |
||
| 2332 | def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>; |
||
| 2333 | def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>; |
||
| 2334 | def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic< |
||
| 2335 | [llvm_v2i16_ty], |
||
| 2336 | [LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty], |
||
| 2337 | [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>, |
||
| 2338 | ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">; |
||
| 2339 | |||
| 2340 | def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>; |
||
| 2341 | def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>; |
||
| 2342 | def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>; |
||
| 2343 | def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>; |
||
| 2344 | |||
| 2345 | class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> : |
||
| 2346 | AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>; |
||
| 2347 | |||
| 2348 | multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> { |
||
| 2349 | foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in |
||
| 2350 | def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>; |
||
| 2351 | } |
||
| 2352 | |||
| 2353 | defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>; |
||
| 2354 | defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>; |
||
| 2355 | |||
| 2356 | // llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid |
||
| 2357 | class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> : |
||
| 2358 | ClangBuiltin<!subst("int", "__builtin", NAME)>, |
||
| 2359 | DefaultAttrsIntrinsic<[DestTy], |
||
| 2360 | [SrcA, SrcB, DestTy, llvm_i32_ty, |
||
| 2361 | llvm_i32_ty, llvm_i32_ty], |
||
| 2362 | [IntrConvergent, IntrNoMem, |
||
| 2363 | ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; |
||
| 2364 | |||
| 2365 | def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; |
||
| 2366 | def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>; |
||
| 2367 | def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; |
||
| 2368 | def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>; |
||
| 2369 | def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; |
||
| 2370 | def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>; |
||
| 2371 | |||
| 2372 | class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> : |
||
| 2373 | AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>; |
||
| 2374 | |||
| 2375 | multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> { |
||
| 2376 | foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in |
||
| 2377 | def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>; |
||
| 2378 | } |
||
| 2379 | |||
| 2380 | defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>; |
||
| 2381 | defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>; |
||
| 2382 | |||
| 2383 | // llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3] |
||
| 2384 | // byte_sel selects byte from srcA. |
||
| 2385 | def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">, |
||
| 2386 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 2387 | [llvm_i32_ty, llvm_i32_ty], |
||
| 2388 | [IntrNoMem, ImmArg<ArgIndex<1>>]>; |
||
| 2389 | |||
| 2390 | // llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3] |
||
| 2391 | def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">, |
||
| 2392 | DefaultAttrsIntrinsic<[llvm_float_ty], |
||
| 2393 | [llvm_i32_ty, llvm_i32_ty], |
||
| 2394 | [IntrNoMem, ImmArg<ArgIndex<1>>]>; |
||
| 2395 | |||
| 2396 | // llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel |
||
| 2397 | // word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes. |
||
| 2398 | def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">, |
||
| 2399 | DefaultAttrsIntrinsic<[llvm_v2f32_ty], |
||
| 2400 | [llvm_i32_ty, llvm_i1_ty], |
||
| 2401 | [IntrNoMem, ImmArg<ArgIndex<1>>]>; |
||
| 2402 | |||
| 2403 | // llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel. |
||
| 2404 | def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">, |
||
| 2405 | DefaultAttrsIntrinsic<[llvm_v2f32_ty], |
||
| 2406 | [llvm_i32_ty, llvm_i1_ty], |
||
| 2407 | [IntrNoMem, ImmArg<ArgIndex<1>>]>; |
||
| 2408 | |||
| 2409 | // llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel |
||
| 2410 | // word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes. |
||
| 2411 | def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">, |
||
| 2412 | DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 2413 | [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], |
||
| 2414 | [IntrNoMem, ImmArg<ArgIndex<3>>]>; |
||
| 2415 | |||
| 2416 | // llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel |
||
| 2417 | def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">, |
||
| 2418 | DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 2419 | [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty], |
||
| 2420 | [IntrNoMem, ImmArg<ArgIndex<3>>]>; |
||
| 2421 | |||
| 2422 | // llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] |
||
| 2423 | // byte_sel selects byte to write into vdst. |
||
| 2424 | def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">, |
||
| 2425 | DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 2426 | [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 2427 | [IntrNoMem, ImmArg<ArgIndex<3>>]>; |
||
| 2428 | |||
| 2429 | // llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3] |
||
| 2430 | def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">, |
||
| 2431 | DefaultAttrsIntrinsic<[llvm_i32_ty], |
||
| 2432 | [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], |
||
| 2433 | [IntrNoMem, ImmArg<ArgIndex<3>>]>; |
||
| 2434 | |||
| 2435 | // Represent a relocation constant. |
||
| 2436 | def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic< |
||
| 2437 | [llvm_i32_ty], [llvm_metadata_ty], |
||
| 2438 | [IntrNoMem, IntrSpeculatable] |
||
| 2439 | >; |
||
| 2440 | |||
| 2441 | //===----------------------------------------------------------------------===// |
||
| 2442 | // Special Intrinsics for backend internal use only. No frontend |
||
| 2443 | // should emit calls to these. |
||
| 2444 | //===----------------------------------------------------------------------===// |
||
| 2445 | def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], |
||
| 2446 | [llvm_i1_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2447 | >; |
||
| 2448 | |||
| 2449 | def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty], |
||
| 2450 | [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2451 | >; |
||
| 2452 | |||
| 2453 | def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty], |
||
| 2454 | [llvm_i1_ty, LLVMMatchType<0>], |
||
| 2455 | [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2456 | >; |
||
| 2457 | |||
| 2458 | def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], |
||
| 2459 | [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] |
||
| 2460 | >; |
||
| 2461 | |||
| 2462 | def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], |
||
| 2463 | [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; |
||
| 2464 | |||
| 2465 | // Represent unreachable in a divergent region. |
||
| 2466 | def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>; |
||
| 2467 | |||
| 2468 | // Emit 2.5 ulp, no denormal division. Should only be inserted by |
||
| 2469 | // pass based on !fpmath metadata. |
||
| 2470 | def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< |
||
| 2471 | [llvm_float_ty], [llvm_float_ty, llvm_float_ty], |
||
| 2472 | [IntrNoMem, IntrSpeculatable] |
||
| 2473 | >; |
||
| 2474 | } |