Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | //===-- llvm/MC/MCSchedule.h - Scheduling -----------------------*- C++ -*-===// |
2 | // |
||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | // See https://llvm.org/LICENSE.txt for license information. |
||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | // |
||
7 | //===----------------------------------------------------------------------===// |
||
8 | // |
||
9 | // This file defines the classes used to describe a subtarget's machine model |
||
10 | // for scheduling and other instruction cost heuristics. |
||
11 | // |
||
12 | //===----------------------------------------------------------------------===// |
||
13 | |||
14 | #ifndef LLVM_MC_MCSCHEDULE_H |
||
15 | #define LLVM_MC_MCSCHEDULE_H |
||
16 | |||
17 | #include "llvm/Config/llvm-config.h" |
||
18 | #include "llvm/Support/DataTypes.h" |
||
19 | #include <cassert> |
||
20 | |||
21 | namespace llvm { |
||
22 | |||
23 | template <typename T> class ArrayRef; |
||
24 | struct InstrItinerary; |
||
25 | class MCSubtargetInfo; |
||
26 | class MCInstrInfo; |
||
27 | class MCInst; |
||
28 | class InstrItineraryData; |
||
29 | |||
30 | /// Define a kind of processor resource that will be modeled by the scheduler. |
||
31 | struct MCProcResourceDesc { |
||
32 | const char *Name; |
||
33 | unsigned NumUnits; // Number of resource of this kind |
||
34 | unsigned SuperIdx; // Index of the resources kind that contains this kind. |
||
35 | |||
36 | // Number of resources that may be buffered. |
||
37 | // |
||
38 | // Buffered resources (BufferSize != 0) may be consumed at some indeterminate |
||
39 | // cycle after dispatch. This should be used for out-of-order cpus when |
||
40 | // instructions that use this resource can be buffered in a reservaton |
||
41 | // station. |
||
42 | // |
||
43 | // Unbuffered resources (BufferSize == 0) always consume their resource some |
||
44 | // fixed number of cycles after dispatch. If a resource is unbuffered, then |
||
45 | // the scheduler will avoid scheduling instructions with conflicting resources |
||
46 | // in the same cycle. This is for in-order cpus, or the in-order portion of |
||
47 | // an out-of-order cpus. |
||
48 | int BufferSize; |
||
49 | |||
50 | // If the resource has sub-units, a pointer to the first element of an array |
||
51 | // of `NumUnits` elements containing the ProcResourceIdx of the sub units. |
||
52 | // nullptr if the resource does not have sub-units. |
||
53 | const unsigned *SubUnitsIdxBegin; |
||
54 | |||
55 | bool operator==(const MCProcResourceDesc &Other) const { |
||
56 | return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx |
||
57 | && BufferSize == Other.BufferSize; |
||
58 | } |
||
59 | }; |
||
60 | |||
61 | /// Identify one of the processor resource kinds consumed by a particular |
||
62 | /// scheduling class for the specified number of cycles. |
||
63 | struct MCWriteProcResEntry { |
||
64 | uint16_t ProcResourceIdx; |
||
65 | uint16_t Cycles; |
||
66 | |||
67 | bool operator==(const MCWriteProcResEntry &Other) const { |
||
68 | return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles; |
||
69 | } |
||
70 | }; |
||
71 | |||
72 | /// Specify the latency in cpu cycles for a particular scheduling class and def |
||
73 | /// index. -1 indicates an invalid latency. Heuristics would typically consider |
||
74 | /// an instruction with invalid latency to have infinite latency. Also identify |
||
75 | /// the WriteResources of this def. When the operand expands to a sequence of |
||
76 | /// writes, this ID is the last write in the sequence. |
||
77 | struct MCWriteLatencyEntry { |
||
78 | int16_t Cycles; |
||
79 | uint16_t WriteResourceID; |
||
80 | |||
81 | bool operator==(const MCWriteLatencyEntry &Other) const { |
||
82 | return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID; |
||
83 | } |
||
84 | }; |
||
85 | |||
86 | /// Specify the number of cycles allowed after instruction issue before a |
||
87 | /// particular use operand reads its registers. This effectively reduces the |
||
88 | /// write's latency. Here we allow negative cycles for corner cases where |
||
89 | /// latency increases. This rule only applies when the entry's WriteResource |
||
90 | /// matches the write's WriteResource. |
||
91 | /// |
||
92 | /// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by |
||
93 | /// WriteResourceIdx. |
||
94 | struct MCReadAdvanceEntry { |
||
95 | unsigned UseIdx; |
||
96 | unsigned WriteResourceID; |
||
97 | int Cycles; |
||
98 | |||
99 | bool operator==(const MCReadAdvanceEntry &Other) const { |
||
100 | return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID |
||
101 | && Cycles == Other.Cycles; |
||
102 | } |
||
103 | }; |
||
104 | |||
105 | /// Summarize the scheduling resources required for an instruction of a |
||
106 | /// particular scheduling class. |
||
107 | /// |
||
108 | /// Defined as an aggregate struct for creating tables with initializer lists. |
||
109 | struct MCSchedClassDesc { |
||
110 | static const unsigned short InvalidNumMicroOps = (1U << 13) - 1; |
||
111 | static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1; |
||
112 | |||
113 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
||
114 | const char* Name; |
||
115 | #endif |
||
116 | uint16_t NumMicroOps : 13; |
||
117 | uint16_t BeginGroup : 1; |
||
118 | uint16_t EndGroup : 1; |
||
119 | uint16_t RetireOOO : 1; |
||
120 | uint16_t WriteProcResIdx; // First index into WriteProcResTable. |
||
121 | uint16_t NumWriteProcResEntries; |
||
122 | uint16_t WriteLatencyIdx; // First index into WriteLatencyTable. |
||
123 | uint16_t NumWriteLatencyEntries; |
||
124 | uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable. |
||
125 | uint16_t NumReadAdvanceEntries; |
||
126 | |||
127 | bool isValid() const { |
||
128 | return NumMicroOps != InvalidNumMicroOps; |
||
129 | } |
||
130 | bool isVariant() const { |
||
131 | return NumMicroOps == VariantNumMicroOps; |
||
132 | } |
||
133 | }; |
||
134 | |||
135 | /// Specify the cost of a register definition in terms of number of physical |
||
136 | /// register allocated at register renaming stage. For example, AMD Jaguar. |
||
137 | /// natively supports 128-bit data types, and operations on 256-bit registers |
||
138 | /// (i.e. YMM registers) are internally split into two COPs (complex operations) |
||
139 | /// and each COP updates a physical register. Basically, on Jaguar, a YMM |
||
140 | /// register write effectively consumes two physical registers. That means, |
||
141 | /// the cost of a YMM write in the BtVer2 model is 2. |
||
142 | struct MCRegisterCostEntry { |
||
143 | unsigned RegisterClassID; |
||
144 | unsigned Cost; |
||
145 | bool AllowMoveElimination; |
||
146 | }; |
||
147 | |||
148 | /// A register file descriptor. |
||
149 | /// |
||
150 | /// This struct allows to describe processor register files. In particular, it |
||
151 | /// helps describing the size of the register file, as well as the cost of |
||
152 | /// allocating a register file at register renaming stage. |
||
153 | /// FIXME: this struct can be extended to provide information about the number |
||
154 | /// of read/write ports to the register file. A value of zero for field |
||
155 | /// 'NumPhysRegs' means: this register file has an unbounded number of physical |
||
156 | /// registers. |
||
157 | struct MCRegisterFileDesc { |
||
158 | const char *Name; |
||
159 | uint16_t NumPhysRegs; |
||
160 | uint16_t NumRegisterCostEntries; |
||
161 | // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable. |
||
162 | uint16_t RegisterCostEntryIdx; |
||
163 | // A value of zero means: there is no limit in the number of moves that can be |
||
164 | // eliminated every cycle. |
||
165 | uint16_t MaxMovesEliminatedPerCycle; |
||
166 | // Ture if this register file only knows how to optimize register moves from |
||
167 | // known zero registers. |
||
168 | bool AllowZeroMoveEliminationOnly; |
||
169 | }; |
||
170 | |||
171 | /// Provide extra details about the machine processor. |
||
172 | /// |
||
173 | /// This is a collection of "optional" processor information that is not |
||
174 | /// normally used by the LLVM machine schedulers, but that can be consumed by |
||
175 | /// external tools like llvm-mca to improve the quality of the peformance |
||
176 | /// analysis. |
||
177 | struct MCExtraProcessorInfo { |
||
178 | // Actual size of the reorder buffer in hardware. |
||
179 | unsigned ReorderBufferSize; |
||
180 | // Number of instructions retired per cycle. |
||
181 | unsigned MaxRetirePerCycle; |
||
182 | const MCRegisterFileDesc *RegisterFiles; |
||
183 | unsigned NumRegisterFiles; |
||
184 | const MCRegisterCostEntry *RegisterCostTable; |
||
185 | unsigned NumRegisterCostEntries; |
||
186 | unsigned LoadQueueID; |
||
187 | unsigned StoreQueueID; |
||
188 | }; |
||
189 | |||
190 | /// Machine model for scheduling, bundling, and heuristics. |
||
191 | /// |
||
192 | /// The machine model directly provides basic information about the |
||
193 | /// microarchitecture to the scheduler in the form of properties. It also |
||
194 | /// optionally refers to scheduler resource tables and itinerary |
||
195 | /// tables. Scheduler resource tables model the latency and cost for each |
||
196 | /// instruction type. Itinerary tables are an independent mechanism that |
||
197 | /// provides a detailed reservation table describing each cycle of instruction |
||
198 | /// execution. Subtargets may define any or all of the above categories of data |
||
199 | /// depending on the type of CPU and selected scheduler. |
||
200 | /// |
||
201 | /// The machine independent properties defined here are used by the scheduler as |
||
202 | /// an abstract machine model. A real micro-architecture has a number of |
||
203 | /// buffers, queues, and stages. Declaring that a given machine-independent |
||
204 | /// abstract property corresponds to a specific physical property across all |
||
205 | /// subtargets can't be done. Nonetheless, the abstract model is |
||
206 | /// useful. Futhermore, subtargets typically extend this model with processor |
||
207 | /// specific resources to model any hardware features that can be exploited by |
||
208 | /// scheduling heuristics and aren't sufficiently represented in the abstract. |
||
209 | /// |
||
210 | /// The abstract pipeline is built around the notion of an "issue point". This |
||
211 | /// is merely a reference point for counting machine cycles. The physical |
||
212 | /// machine will have pipeline stages that delay execution. The scheduler does |
||
213 | /// not model those delays because they are irrelevant as long as they are |
||
214 | /// consistent. Inaccuracies arise when instructions have different execution |
||
215 | /// delays relative to each other, in addition to their intrinsic latency. Those |
||
216 | /// special cases can be handled by TableGen constructs such as, ReadAdvance, |
||
217 | /// which reduces latency when reading data, and ResourceCycles, which consumes |
||
218 | /// a processor resource when writing data for a number of abstract |
||
219 | /// cycles. |
||
220 | /// |
||
221 | /// TODO: One tool currently missing is the ability to add a delay to |
||
222 | /// ResourceCycles. That would be easy to add and would likely cover all cases |
||
223 | /// currently handled by the legacy itinerary tables. |
||
224 | /// |
||
225 | /// A note on out-of-order execution and, more generally, instruction |
||
226 | /// buffers. Part of the CPU pipeline is always in-order. The issue point, which |
||
227 | /// is the point of reference for counting cycles, only makes sense as an |
||
228 | /// in-order part of the pipeline. Other parts of the pipeline are sometimes |
||
229 | /// falling behind and sometimes catching up. It's only interesting to model |
||
230 | /// those other, decoupled parts of the pipeline if they may be predictably |
||
231 | /// resource constrained in a way that the scheduler can exploit. |
||
232 | /// |
||
233 | /// The LLVM machine model distinguishes between in-order constraints and |
||
234 | /// out-of-order constraints so that the target's scheduling strategy can apply |
||
235 | /// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order |
||
236 | /// resources would not typically be treated as a hard scheduling |
||
237 | /// constraint. For example, in the GenericScheduler, a delay caused by limited |
||
238 | /// out-of-order resources is not directly reflected in the number of cycles |
||
239 | /// that the scheduler sees between issuing an instruction and its dependent |
||
240 | /// instructions. In other words, out-of-order resources don't directly increase |
||
241 | /// the latency between pairs of instructions. However, they can still be used |
||
242 | /// to detect potential bottlenecks across a sequence of instructions and bias |
||
243 | /// the scheduling heuristics appropriately. |
||
244 | struct MCSchedModel { |
||
245 | // IssueWidth is the maximum number of instructions that may be scheduled in |
||
246 | // the same per-cycle group. This is meant to be a hard in-order constraint |
||
247 | // (a.k.a. "hazard"). In the GenericScheduler strategy, no more than |
||
248 | // IssueWidth micro-ops can ever be scheduled in a particular cycle. |
||
249 | // |
||
250 | // In practice, IssueWidth is useful to model any bottleneck between the |
||
251 | // decoder (after micro-op expansion) and the out-of-order reservation |
||
252 | // stations or the decoder bandwidth itself. If the total number of |
||
253 | // reservation stations is also a bottleneck, or if any other pipeline stage |
||
254 | // has a bandwidth limitation, then that can be naturally modeled by adding an |
||
255 | // out-of-order processor resource. |
||
256 | unsigned IssueWidth; |
||
257 | static const unsigned DefaultIssueWidth = 1; |
||
258 | |||
259 | // MicroOpBufferSize is the number of micro-ops that the processor may buffer |
||
260 | // for out-of-order execution. |
||
261 | // |
||
262 | // "0" means operations that are not ready in this cycle are not considered |
||
263 | // for scheduling (they go in the pending queue). Latency is paramount. This |
||
264 | // may be more efficient if many instructions are pending in a schedule. |
||
265 | // |
||
266 | // "1" means all instructions are considered for scheduling regardless of |
||
267 | // whether they are ready in this cycle. Latency still causes issue stalls, |
||
268 | // but we balance those stalls against other heuristics. |
||
269 | // |
||
270 | // "> 1" means the processor is out-of-order. This is a machine independent |
||
271 | // estimate of highly machine specific characteristics such as the register |
||
272 | // renaming pool and reorder buffer. |
||
273 | unsigned MicroOpBufferSize; |
||
274 | static const unsigned DefaultMicroOpBufferSize = 0; |
||
275 | |||
276 | // LoopMicroOpBufferSize is the number of micro-ops that the processor may |
||
277 | // buffer for optimized loop execution. More generally, this represents the |
||
278 | // optimal number of micro-ops in a loop body. A loop may be partially |
||
279 | // unrolled to bring the count of micro-ops in the loop body closer to this |
||
280 | // number. |
||
281 | unsigned LoopMicroOpBufferSize; |
||
282 | static const unsigned DefaultLoopMicroOpBufferSize = 0; |
||
283 | |||
284 | // LoadLatency is the expected latency of load instructions. |
||
285 | unsigned LoadLatency; |
||
286 | static const unsigned DefaultLoadLatency = 4; |
||
287 | |||
288 | // HighLatency is the expected latency of "very high latency" operations. |
||
289 | // See TargetInstrInfo::isHighLatencyDef(). |
||
290 | // By default, this is set to an arbitrarily high number of cycles |
||
291 | // likely to have some impact on scheduling heuristics. |
||
292 | unsigned HighLatency; |
||
293 | static const unsigned DefaultHighLatency = 10; |
||
294 | |||
295 | // MispredictPenalty is the typical number of extra cycles the processor |
||
296 | // takes to recover from a branch misprediction. |
||
297 | unsigned MispredictPenalty; |
||
298 | static const unsigned DefaultMispredictPenalty = 10; |
||
299 | |||
300 | bool PostRAScheduler; // default value is false |
||
301 | |||
302 | bool CompleteModel; |
||
303 | |||
304 | unsigned ProcID; |
||
305 | const MCProcResourceDesc *ProcResourceTable; |
||
306 | const MCSchedClassDesc *SchedClassTable; |
||
307 | unsigned NumProcResourceKinds; |
||
308 | unsigned NumSchedClasses; |
||
309 | // Instruction itinerary tables used by InstrItineraryData. |
||
310 | friend class InstrItineraryData; |
||
311 | const InstrItinerary *InstrItineraries; |
||
312 | |||
313 | const MCExtraProcessorInfo *ExtraProcessorInfo; |
||
314 | |||
315 | bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; } |
||
316 | |||
317 | unsigned getProcessorID() const { return ProcID; } |
||
318 | |||
319 | /// Does this machine model include instruction-level scheduling. |
||
320 | bool hasInstrSchedModel() const { return SchedClassTable; } |
||
321 | |||
322 | const MCExtraProcessorInfo &getExtraProcessorInfo() const { |
||
323 | assert(hasExtraProcessorInfo() && |
||
324 | "No extra information available for this model"); |
||
325 | return *ExtraProcessorInfo; |
||
326 | } |
||
327 | |||
328 | /// Return true if this machine model data for all instructions with a |
||
329 | /// scheduling class (itinerary class or SchedRW list). |
||
330 | bool isComplete() const { return CompleteModel; } |
||
331 | |||
332 | /// Return true if machine supports out of order execution. |
||
333 | bool isOutOfOrder() const { return MicroOpBufferSize > 1; } |
||
334 | |||
335 | unsigned getNumProcResourceKinds() const { |
||
336 | return NumProcResourceKinds; |
||
337 | } |
||
338 | |||
339 | const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const { |
||
340 | assert(hasInstrSchedModel() && "No scheduling machine model"); |
||
341 | |||
342 | assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx"); |
||
343 | return &ProcResourceTable[ProcResourceIdx]; |
||
344 | } |
||
345 | |||
346 | const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const { |
||
347 | assert(hasInstrSchedModel() && "No scheduling machine model"); |
||
348 | |||
349 | assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx"); |
||
350 | return &SchedClassTable[SchedClassIdx]; |
||
351 | } |
||
352 | |||
353 | /// Returns the latency value for the scheduling class. |
||
354 | static int computeInstrLatency(const MCSubtargetInfo &STI, |
||
355 | const MCSchedClassDesc &SCDesc); |
||
356 | |||
357 | int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const; |
||
358 | int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, |
||
359 | const MCInst &Inst) const; |
||
360 | |||
361 | // Returns the reciprocal throughput information from a MCSchedClassDesc. |
||
362 | static double |
||
363 | getReciprocalThroughput(const MCSubtargetInfo &STI, |
||
364 | const MCSchedClassDesc &SCDesc); |
||
365 | |||
366 | static double |
||
367 | getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID); |
||
368 | |||
369 | double |
||
370 | getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, |
||
371 | const MCInst &Inst) const; |
||
372 | |||
373 | /// Returns the maximum forwarding delay for register reads dependent on |
||
374 | /// writes of scheduling class WriteResourceIdx. |
||
375 | static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries, |
||
376 | unsigned WriteResourceIdx = 0); |
||
377 | |||
378 | /// Returns the default initialized model. |
||
379 | static const MCSchedModel &GetDefaultSchedModel() { return Default; } |
||
380 | static const MCSchedModel Default; |
||
381 | }; |
||
382 | |||
383 | } // namespace llvm |
||
384 | |||
385 | #endif |