Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line | 
|---|---|---|---|
| 14 | pmbaty | 1 | //===-- llvm/MC/MCSchedule.h - Scheduling -----------------------*- C++ -*-===// | 
| 2 | // | ||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| 6 | // | ||
| 7 | //===----------------------------------------------------------------------===// | ||
| 8 | // | ||
| 9 | // This file defines the classes used to describe a subtarget's machine model | ||
| 10 | // for scheduling and other instruction cost heuristics. | ||
| 11 | // | ||
| 12 | //===----------------------------------------------------------------------===// | ||
| 13 | |||
| 14 | #ifndef LLVM_MC_MCSCHEDULE_H | ||
| 15 | #define LLVM_MC_MCSCHEDULE_H | ||
| 16 | |||
| 17 | #include "llvm/Config/llvm-config.h" | ||
| 18 | #include "llvm/Support/DataTypes.h" | ||
| 19 | #include <cassert> | ||
| 20 | |||
| 21 | namespace llvm { | ||
| 22 | |||
| 23 | template <typename T> class ArrayRef; | ||
| 24 | struct InstrItinerary; | ||
| 25 | class MCSubtargetInfo; | ||
| 26 | class MCInstrInfo; | ||
| 27 | class MCInst; | ||
| 28 | class InstrItineraryData; | ||
| 29 | |||
| 30 | /// Define a kind of processor resource that will be modeled by the scheduler. | ||
| 31 | struct MCProcResourceDesc { | ||
| 32 | const char *Name; | ||
| 33 | unsigned NumUnits; // Number of resource of this kind | ||
| 34 | unsigned SuperIdx; // Index of the resources kind that contains this kind. | ||
| 35 | |||
| 36 |   // Number of resources that may be buffered. | ||
| 37 |   // | ||
| 38 |   // Buffered resources (BufferSize != 0) may be consumed at some indeterminate | ||
| 39 |   // cycle after dispatch. This should be used for out-of-order cpus when | ||
| 40 |   // instructions that use this resource can be buffered in a reservaton | ||
| 41 |   // station. | ||
| 42 |   // | ||
| 43 |   // Unbuffered resources (BufferSize == 0) always consume their resource some | ||
| 44 |   // fixed number of cycles after dispatch. If a resource is unbuffered, then | ||
| 45 |   // the scheduler will avoid scheduling instructions with conflicting resources | ||
| 46 |   // in the same cycle. This is for in-order cpus, or the in-order portion of | ||
| 47 |   // an out-of-order cpus. | ||
| 48 | int BufferSize; | ||
| 49 | |||
| 50 |   // If the resource has sub-units, a pointer to the first element of an array | ||
| 51 |   // of `NumUnits` elements containing the ProcResourceIdx of the sub units. | ||
| 52 |   // nullptr if the resource does not have sub-units. | ||
| 53 | const unsigned *SubUnitsIdxBegin; | ||
| 54 | |||
| 55 | bool operator==(const MCProcResourceDesc &Other) const { | ||
| 56 | return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx | ||
| 57 | && BufferSize == Other.BufferSize; | ||
| 58 |   } | ||
| 59 | }; | ||
| 60 | |||
| 61 | /// Identify one of the processor resource kinds consumed by a particular | ||
| 62 | /// scheduling class for the specified number of cycles. | ||
| 63 | struct MCWriteProcResEntry { | ||
| 64 | uint16_t ProcResourceIdx; | ||
| 65 | uint16_t Cycles; | ||
| 66 | |||
| 67 | bool operator==(const MCWriteProcResEntry &Other) const { | ||
| 68 | return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles; | ||
| 69 |   } | ||
| 70 | }; | ||
| 71 | |||
| 72 | /// Specify the latency in cpu cycles for a particular scheduling class and def | ||
| 73 | /// index. -1 indicates an invalid latency. Heuristics would typically consider | ||
| 74 | /// an instruction with invalid latency to have infinite latency.  Also identify | ||
| 75 | /// the WriteResources of this def. When the operand expands to a sequence of | ||
| 76 | /// writes, this ID is the last write in the sequence. | ||
| 77 | struct MCWriteLatencyEntry { | ||
| 78 | int16_t Cycles; | ||
| 79 | uint16_t WriteResourceID; | ||
| 80 | |||
| 81 | bool operator==(const MCWriteLatencyEntry &Other) const { | ||
| 82 | return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID; | ||
| 83 |   } | ||
| 84 | }; | ||
| 85 | |||
| 86 | /// Specify the number of cycles allowed after instruction issue before a | ||
| 87 | /// particular use operand reads its registers. This effectively reduces the | ||
| 88 | /// write's latency. Here we allow negative cycles for corner cases where | ||
| 89 | /// latency increases. This rule only applies when the entry's WriteResource | ||
| 90 | /// matches the write's WriteResource. | ||
| 91 | /// | ||
| 92 | /// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by | ||
| 93 | /// WriteResourceIdx. | ||
| 94 | struct MCReadAdvanceEntry { | ||
| 95 | unsigned UseIdx; | ||
| 96 | unsigned WriteResourceID; | ||
| 97 | int Cycles; | ||
| 98 | |||
| 99 | bool operator==(const MCReadAdvanceEntry &Other) const { | ||
| 100 | return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID | ||
| 101 | && Cycles == Other.Cycles; | ||
| 102 |   } | ||
| 103 | }; | ||
| 104 | |||
| 105 | /// Summarize the scheduling resources required for an instruction of a | ||
| 106 | /// particular scheduling class. | ||
| 107 | /// | ||
| 108 | /// Defined as an aggregate struct for creating tables with initializer lists. | ||
| 109 | struct MCSchedClassDesc { | ||
| 110 | static const unsigned short InvalidNumMicroOps = (1U << 13) - 1; | ||
| 111 | static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1; | ||
| 112 | |||
| 113 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | ||
| 114 | const char* Name; | ||
| 115 | #endif | ||
| 116 | uint16_t NumMicroOps : 13; | ||
| 117 | uint16_t BeginGroup : 1; | ||
| 118 | uint16_t EndGroup : 1; | ||
| 119 | uint16_t RetireOOO : 1; | ||
| 120 | uint16_t WriteProcResIdx; // First index into WriteProcResTable. | ||
| 121 | uint16_t NumWriteProcResEntries; | ||
| 122 | uint16_t WriteLatencyIdx; // First index into WriteLatencyTable. | ||
| 123 | uint16_t NumWriteLatencyEntries; | ||
| 124 | uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable. | ||
| 125 | uint16_t NumReadAdvanceEntries; | ||
| 126 | |||
| 127 | bool isValid() const { | ||
| 128 | return NumMicroOps != InvalidNumMicroOps; | ||
| 129 |   } | ||
| 130 | bool isVariant() const { | ||
| 131 | return NumMicroOps == VariantNumMicroOps; | ||
| 132 |   } | ||
| 133 | }; | ||
| 134 | |||
| 135 | /// Specify the cost of a register definition in terms of number of physical | ||
| 136 | /// register allocated at register renaming stage. For example, AMD Jaguar. | ||
| 137 | /// natively supports 128-bit data types, and operations on 256-bit registers | ||
| 138 | /// (i.e. YMM registers) are internally split into two COPs (complex operations) | ||
| 139 | /// and each COP updates a physical register. Basically, on Jaguar, a YMM | ||
| 140 | /// register write effectively consumes two physical registers. That means, | ||
| 141 | /// the cost of a YMM write in the BtVer2 model is 2. | ||
| 142 | struct MCRegisterCostEntry { | ||
| 143 | unsigned RegisterClassID; | ||
| 144 | unsigned Cost; | ||
| 145 | bool AllowMoveElimination; | ||
| 146 | }; | ||
| 147 | |||
| 148 | /// A register file descriptor. | ||
| 149 | /// | ||
| 150 | /// This struct allows to describe processor register files. In particular, it | ||
| 151 | /// helps describing the size of the register file, as well as the cost of | ||
| 152 | /// allocating a register file at register renaming stage. | ||
| 153 | /// FIXME: this struct can be extended to provide information about the number | ||
| 154 | /// of read/write ports to the register file.  A value of zero for field | ||
| 155 | /// 'NumPhysRegs' means: this register file has an unbounded number of physical | ||
| 156 | /// registers. | ||
| 157 | struct MCRegisterFileDesc { | ||
| 158 | const char *Name; | ||
| 159 | uint16_t NumPhysRegs; | ||
| 160 | uint16_t NumRegisterCostEntries; | ||
| 161 |   // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable. | ||
| 162 | uint16_t RegisterCostEntryIdx; | ||
| 163 |   // A value of zero means: there is no limit in the number of moves that can be | ||
| 164 |   // eliminated every cycle. | ||
| 165 | uint16_t MaxMovesEliminatedPerCycle; | ||
| 166 |   // Ture if this register file only knows how to optimize register moves from | ||
| 167 |   // known zero registers. | ||
| 168 | bool AllowZeroMoveEliminationOnly; | ||
| 169 | }; | ||
| 170 | |||
| 171 | /// Provide extra details about the machine processor. | ||
| 172 | /// | ||
| 173 | /// This is a collection of "optional" processor information that is not | ||
| 174 | /// normally used by the LLVM machine schedulers, but that can be consumed by | ||
| 175 | /// external tools like llvm-mca to improve the quality of the peformance | ||
| 176 | /// analysis. | ||
| 177 | struct MCExtraProcessorInfo { | ||
| 178 |   // Actual size of the reorder buffer in hardware. | ||
| 179 | unsigned ReorderBufferSize; | ||
| 180 |   // Number of instructions retired per cycle. | ||
| 181 | unsigned MaxRetirePerCycle; | ||
| 182 | const MCRegisterFileDesc *RegisterFiles; | ||
| 183 | unsigned NumRegisterFiles; | ||
| 184 | const MCRegisterCostEntry *RegisterCostTable; | ||
| 185 | unsigned NumRegisterCostEntries; | ||
| 186 | unsigned LoadQueueID; | ||
| 187 | unsigned StoreQueueID; | ||
| 188 | }; | ||
| 189 | |||
| 190 | /// Machine model for scheduling, bundling, and heuristics. | ||
| 191 | /// | ||
| 192 | /// The machine model directly provides basic information about the | ||
| 193 | /// microarchitecture to the scheduler in the form of properties. It also | ||
| 194 | /// optionally refers to scheduler resource tables and itinerary | ||
| 195 | /// tables. Scheduler resource tables model the latency and cost for each | ||
| 196 | /// instruction type. Itinerary tables are an independent mechanism that | ||
| 197 | /// provides a detailed reservation table describing each cycle of instruction | ||
| 198 | /// execution. Subtargets may define any or all of the above categories of data | ||
| 199 | /// depending on the type of CPU and selected scheduler. | ||
| 200 | /// | ||
| 201 | /// The machine independent properties defined here are used by the scheduler as | ||
| 202 | /// an abstract machine model. A real micro-architecture has a number of | ||
| 203 | /// buffers, queues, and stages. Declaring that a given machine-independent | ||
| 204 | /// abstract property corresponds to a specific physical property across all | ||
| 205 | /// subtargets can't be done. Nonetheless, the abstract model is | ||
| 206 | /// useful. Futhermore, subtargets typically extend this model with processor | ||
| 207 | /// specific resources to model any hardware features that can be exploited by | ||
| 208 | /// scheduling heuristics and aren't sufficiently represented in the abstract. | ||
| 209 | /// | ||
| 210 | /// The abstract pipeline is built around the notion of an "issue point". This | ||
| 211 | /// is merely a reference point for counting machine cycles. The physical | ||
| 212 | /// machine will have pipeline stages that delay execution. The scheduler does | ||
| 213 | /// not model those delays because they are irrelevant as long as they are | ||
| 214 | /// consistent. Inaccuracies arise when instructions have different execution | ||
| 215 | /// delays relative to each other, in addition to their intrinsic latency. Those | ||
| 216 | /// special cases can be handled by TableGen constructs such as, ReadAdvance, | ||
| 217 | /// which reduces latency when reading data, and ResourceCycles, which consumes | ||
| 218 | /// a processor resource when writing data for a number of abstract | ||
| 219 | /// cycles. | ||
| 220 | /// | ||
| 221 | /// TODO: One tool currently missing is the ability to add a delay to | ||
| 222 | /// ResourceCycles. That would be easy to add and would likely cover all cases | ||
| 223 | /// currently handled by the legacy itinerary tables. | ||
| 224 | /// | ||
| 225 | /// A note on out-of-order execution and, more generally, instruction | ||
| 226 | /// buffers. Part of the CPU pipeline is always in-order. The issue point, which | ||
| 227 | /// is the point of reference for counting cycles, only makes sense as an | ||
| 228 | /// in-order part of the pipeline. Other parts of the pipeline are sometimes | ||
| 229 | /// falling behind and sometimes catching up. It's only interesting to model | ||
| 230 | /// those other, decoupled parts of the pipeline if they may be predictably | ||
| 231 | /// resource constrained in a way that the scheduler can exploit. | ||
| 232 | /// | ||
| 233 | /// The LLVM machine model distinguishes between in-order constraints and | ||
| 234 | /// out-of-order constraints so that the target's scheduling strategy can apply | ||
| 235 | /// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order | ||
| 236 | /// resources would not typically be treated as a hard scheduling | ||
| 237 | /// constraint. For example, in the GenericScheduler, a delay caused by limited | ||
| 238 | /// out-of-order resources is not directly reflected in the number of cycles | ||
| 239 | /// that the scheduler sees between issuing an instruction and its dependent | ||
| 240 | /// instructions. In other words, out-of-order resources don't directly increase | ||
| 241 | /// the latency between pairs of instructions. However, they can still be used | ||
| 242 | /// to detect potential bottlenecks across a sequence of instructions and bias | ||
| 243 | /// the scheduling heuristics appropriately. | ||
| 244 | struct MCSchedModel { | ||
| 245 |   // IssueWidth is the maximum number of instructions that may be scheduled in | ||
| 246 |   // the same per-cycle group. This is meant to be a hard in-order constraint | ||
| 247 |   // (a.k.a. "hazard"). In the GenericScheduler strategy, no more than | ||
| 248 |   // IssueWidth micro-ops can ever be scheduled in a particular cycle. | ||
| 249 |   // | ||
| 250 |   // In practice, IssueWidth is useful to model any bottleneck between the | ||
| 251 |   // decoder (after micro-op expansion) and the out-of-order reservation | ||
| 252 |   // stations or the decoder bandwidth itself. If the total number of | ||
| 253 |   // reservation stations is also a bottleneck, or if any other pipeline stage | ||
| 254 |   // has a bandwidth limitation, then that can be naturally modeled by adding an | ||
| 255 |   // out-of-order processor resource. | ||
| 256 | unsigned IssueWidth; | ||
| 257 | static const unsigned DefaultIssueWidth = 1; | ||
| 258 | |||
| 259 |   // MicroOpBufferSize is the number of micro-ops that the processor may buffer | ||
| 260 |   // for out-of-order execution. | ||
| 261 |   // | ||
| 262 |   // "0" means operations that are not ready in this cycle are not considered | ||
| 263 |   // for scheduling (they go in the pending queue). Latency is paramount. This | ||
| 264 |   // may be more efficient if many instructions are pending in a schedule. | ||
| 265 |   // | ||
| 266 |   // "1" means all instructions are considered for scheduling regardless of | ||
| 267 |   // whether they are ready in this cycle. Latency still causes issue stalls, | ||
| 268 |   // but we balance those stalls against other heuristics. | ||
| 269 |   // | ||
| 270 |   // "> 1" means the processor is out-of-order. This is a machine independent | ||
| 271 |   // estimate of highly machine specific characteristics such as the register | ||
| 272 |   // renaming pool and reorder buffer. | ||
| 273 | unsigned MicroOpBufferSize; | ||
| 274 | static const unsigned DefaultMicroOpBufferSize = 0; | ||
| 275 | |||
| 276 |   // LoopMicroOpBufferSize is the number of micro-ops that the processor may | ||
| 277 |   // buffer for optimized loop execution. More generally, this represents the | ||
| 278 |   // optimal number of micro-ops in a loop body. A loop may be partially | ||
| 279 |   // unrolled to bring the count of micro-ops in the loop body closer to this | ||
| 280 |   // number. | ||
| 281 | unsigned LoopMicroOpBufferSize; | ||
| 282 | static const unsigned DefaultLoopMicroOpBufferSize = 0; | ||
| 283 | |||
| 284 |   // LoadLatency is the expected latency of load instructions. | ||
| 285 | unsigned LoadLatency; | ||
| 286 | static const unsigned DefaultLoadLatency = 4; | ||
| 287 | |||
| 288 |   // HighLatency is the expected latency of "very high latency" operations. | ||
| 289 |   // See TargetInstrInfo::isHighLatencyDef(). | ||
| 290 |   // By default, this is set to an arbitrarily high number of cycles | ||
| 291 |   // likely to have some impact on scheduling heuristics. | ||
| 292 | unsigned HighLatency; | ||
| 293 | static const unsigned DefaultHighLatency = 10; | ||
| 294 | |||
| 295 |   // MispredictPenalty is the typical number of extra cycles the processor | ||
| 296 |   // takes to recover from a branch misprediction. | ||
| 297 | unsigned MispredictPenalty; | ||
| 298 | static const unsigned DefaultMispredictPenalty = 10; | ||
| 299 | |||
| 300 | bool PostRAScheduler; // default value is false | ||
| 301 | |||
| 302 | bool CompleteModel; | ||
| 303 | |||
| 304 | unsigned ProcID; | ||
| 305 | const MCProcResourceDesc *ProcResourceTable; | ||
| 306 | const MCSchedClassDesc *SchedClassTable; | ||
| 307 | unsigned NumProcResourceKinds; | ||
| 308 | unsigned NumSchedClasses; | ||
| 309 |   // Instruction itinerary tables used by InstrItineraryData. | ||
| 310 | friend class InstrItineraryData; | ||
| 311 | const InstrItinerary *InstrItineraries; | ||
| 312 | |||
| 313 | const MCExtraProcessorInfo *ExtraProcessorInfo; | ||
| 314 | |||
| 315 | bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; } | ||
| 316 | |||
| 317 | unsigned getProcessorID() const { return ProcID; } | ||
| 318 | |||
| 319 |   /// Does this machine model include instruction-level scheduling. | ||
| 320 | bool hasInstrSchedModel() const { return SchedClassTable; } | ||
| 321 | |||
| 322 | const MCExtraProcessorInfo &getExtraProcessorInfo() const { | ||
| 323 | assert(hasExtraProcessorInfo() && | ||
| 324 | "No extra information available for this model"); | ||
| 325 | return *ExtraProcessorInfo; | ||
| 326 |   } | ||
| 327 | |||
| 328 |   /// Return true if this machine model data for all instructions with a | ||
| 329 |   /// scheduling class (itinerary class or SchedRW list). | ||
| 330 | bool isComplete() const { return CompleteModel; } | ||
| 331 | |||
| 332 |   /// Return true if machine supports out of order execution. | ||
| 333 | bool isOutOfOrder() const { return MicroOpBufferSize > 1; } | ||
| 334 | |||
| 335 | unsigned getNumProcResourceKinds() const { | ||
| 336 | return NumProcResourceKinds; | ||
| 337 |   } | ||
| 338 | |||
| 339 | const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const { | ||
| 340 | assert(hasInstrSchedModel() && "No scheduling machine model"); | ||
| 341 | |||
| 342 | assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx"); | ||
| 343 | return &ProcResourceTable[ProcResourceIdx]; | ||
| 344 |   } | ||
| 345 | |||
| 346 | const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const { | ||
| 347 | assert(hasInstrSchedModel() && "No scheduling machine model"); | ||
| 348 | |||
| 349 | assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx"); | ||
| 350 | return &SchedClassTable[SchedClassIdx]; | ||
| 351 |   } | ||
| 352 | |||
| 353 |   /// Returns the latency value for the scheduling class. | ||
| 354 | static int computeInstrLatency(const MCSubtargetInfo &STI, | ||
| 355 | const MCSchedClassDesc &SCDesc); | ||
| 356 | |||
| 357 | int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const; | ||
| 358 | int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, | ||
| 359 | const MCInst &Inst) const; | ||
| 360 | |||
| 361 |   // Returns the reciprocal throughput information from a MCSchedClassDesc. | ||
| 362 | static double | ||
| 363 | getReciprocalThroughput(const MCSubtargetInfo &STI, | ||
| 364 | const MCSchedClassDesc &SCDesc); | ||
| 365 | |||
| 366 | static double | ||
| 367 | getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID); | ||
| 368 | |||
| 369 |   double | ||
| 370 | getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII, | ||
| 371 | const MCInst &Inst) const; | ||
| 372 | |||
| 373 |   /// Returns the maximum forwarding delay for register reads dependent on | ||
| 374 |   /// writes of scheduling class WriteResourceIdx. | ||
| 375 | static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries, | ||
| 376 | unsigned WriteResourceIdx = 0); | ||
| 377 | |||
| 378 |   /// Returns the default initialized model. | ||
| 379 | static const MCSchedModel &GetDefaultSchedModel() { return Default; } | ||
| 380 | static const MCSchedModel Default; | ||
| 381 | }; | ||
| 382 | |||
| 383 | } // namespace llvm | ||
| 384 | |||
| 385 | #endif |