Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line | 
|---|---|---|---|
| 14 | pmbaty | 1 | //===- ModuloSchedule.h - Software pipeline schedule expansion ------------===// | 
| 2 | // | ||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| 6 | // | ||
| 7 | //===----------------------------------------------------------------------===// | ||
| 8 | // | ||
| 9 | // Software pipelining (SWP) is an instruction scheduling technique for loops | ||
| 10 | // that overlaps loop iterations and exploits ILP via compiler transformations. | ||
| 11 | // | ||
| 12 | // There are multiple methods for analyzing a loop and creating a schedule. | ||
| 13 | // An example algorithm is Swing Modulo Scheduling (implemented by the | ||
| 14 | // MachinePipeliner). The details of how a schedule is arrived at are irrelevant | ||
| 15 | // for the task of actually rewriting a loop to adhere to the schedule, which | ||
| 16 | // is what this file does. | ||
| 17 | // | ||
| 18 | // A schedule is, for every instruction in a block, a Cycle and a Stage. Note | ||
| 19 | // that we only support single-block loops, so "block" and "loop" can be used | ||
| 20 | // interchangably. | ||
| 21 | // | ||
| 22 | // The Cycle of an instruction defines a partial order of the instructions in | ||
| 23 | // the remapped loop. Instructions within a cycle must not consume the output | ||
| 24 | // of any instruction in the same cycle. Cycle information is assumed to have | ||
| 25 | // been calculated such that the processor will execute instructions in | ||
| 26 | // lock-step (for example in a VLIW ISA). | ||
| 27 | // | ||
| 28 | // The Stage of an instruction defines the mapping between logical loop | ||
| 29 | // iterations and pipelined loop iterations. An example (unrolled) pipeline | ||
| 30 | // may look something like: | ||
| 31 | // | ||
| 32 | //  I0[0]                      Execute instruction I0 of iteration 0 | ||
| 33 | //  I1[0], I0[1]               Execute I0 of iteration 1 and I1 of iteration 1 | ||
| 34 | //         I1[1], I0[2] | ||
| 35 | //                I1[2], I0[3] | ||
| 36 | // | ||
| 37 | // In the schedule for this unrolled sequence we would say that I0 was scheduled | ||
| 38 | // in stage 0 and I1 in stage 1: | ||
| 39 | // | ||
| 40 | //  loop: | ||
| 41 | //    [stage 0] x = I0 | ||
| 42 | //    [stage 1] I1 x (from stage 0) | ||
| 43 | // | ||
| 44 | // And to actually generate valid code we must insert a phi: | ||
| 45 | // | ||
| 46 | //  loop: | ||
| 47 | //    x' = phi(x) | ||
| 48 | //    x = I0 | ||
| 49 | //    I1 x' | ||
| 50 | // | ||
| 51 | // This is a simple example; the rules for how to generate correct code given | ||
| 52 | // an arbitrary schedule containing loop-carried values are complex. | ||
| 53 | // | ||
| 54 | // Note that these examples only mention the steady-state kernel of the | ||
| 55 | // generated loop; prologs and epilogs must be generated also that prime and | ||
| 56 | // flush the pipeline. Doing so is nontrivial. | ||
| 57 | // | ||
| 58 | //===----------------------------------------------------------------------===// | ||
| 59 | |||
| 60 | #ifndef LLVM_CODEGEN_MODULOSCHEDULE_H | ||
| 61 | #define LLVM_CODEGEN_MODULOSCHEDULE_H | ||
| 62 | |||
| 63 | #include "llvm/CodeGen/MachineFunction.h" | ||
| 64 | #include "llvm/CodeGen/MachineLoopUtils.h" | ||
| 65 | #include "llvm/CodeGen/TargetInstrInfo.h" | ||
| 66 | #include "llvm/CodeGen/TargetSubtargetInfo.h" | ||
| 67 | #include <deque> | ||
| 68 | #include <vector> | ||
| 69 | |||
| 70 | namespace llvm { | ||
| 71 | class MachineBasicBlock; | ||
| 72 | class MachineLoop; | ||
| 73 | class MachineRegisterInfo; | ||
| 74 | class MachineInstr; | ||
| 75 | class LiveIntervals; | ||
| 76 | |||
| 77 | /// Represents a schedule for a single-block loop. For every instruction we | ||
| 78 | /// maintain a Cycle and Stage. | ||
| 79 | class ModuloSchedule { | ||
| 80 | private: | ||
| 81 |   /// The block containing the loop instructions. | ||
| 82 | MachineLoop *Loop; | ||
| 83 | |||
| 84 |   /// The instructions to be generated, in total order. Cycle provides a partial | ||
| 85 |   /// order; the total order within cycles has been decided by the schedule | ||
| 86 |   /// producer. | ||
| 87 | std::vector<MachineInstr *> ScheduledInstrs; | ||
| 88 | |||
| 89 |   /// The cycle for each instruction. | ||
| 90 | DenseMap<MachineInstr *, int> Cycle; | ||
| 91 | |||
| 92 |   /// The stage for each instruction. | ||
| 93 | DenseMap<MachineInstr *, int> Stage; | ||
| 94 | |||
| 95 |   /// The number of stages in this schedule (Max(Stage) + 1). | ||
| 96 | int NumStages; | ||
| 97 | |||
| 98 | public: | ||
| 99 |   /// Create a new ModuloSchedule. | ||
| 100 |   /// \arg ScheduledInstrs The new loop instructions, in total resequenced | ||
| 101 |   ///    order. | ||
| 102 |   /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does | ||
| 103 |   ///    not need to start at zero. ScheduledInstrs must be partially ordered by | ||
| 104 |   ///    Cycle. | ||
| 105 |   /// \arg Stage Stage index for all instructions in ScheduleInstrs. | ||
| 106 | ModuloSchedule(MachineFunction &MF, MachineLoop *Loop, | ||
| 107 | std::vector<MachineInstr *> ScheduledInstrs, | ||
| 108 | DenseMap<MachineInstr *, int> Cycle, | ||
| 109 | DenseMap<MachineInstr *, int> Stage) | ||
| 110 | : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)), | ||
| 111 | Stage(std::move(Stage)) { | ||
| 112 | NumStages = 0; | ||
| 113 | for (auto &KV : this->Stage) | ||
| 114 | NumStages = std::max(NumStages, KV.second); | ||
| 115 | ++NumStages; | ||
| 116 |   } | ||
| 117 | |||
| 118 |   /// Return the single-block loop being scheduled. | ||
| 119 | MachineLoop *getLoop() const { return Loop; } | ||
| 120 | |||
| 121 |   /// Return the number of stages contained in this schedule, which is the | ||
| 122 |   /// largest stage index + 1. | ||
| 123 | int getNumStages() const { return NumStages; } | ||
| 124 | |||
| 125 |   /// Return the first cycle in the schedule, which is the cycle index of the | ||
| 126 |   /// first instruction. | ||
| 127 | int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; } | ||
| 128 | |||
| 129 |   /// Return the final cycle in the schedule, which is the cycle index of the | ||
| 130 |   /// last instruction. | ||
| 131 | int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; } | ||
| 132 | |||
| 133 |   /// Return the stage that MI is scheduled in, or -1. | ||
| 134 | int getStage(MachineInstr *MI) { | ||
| 135 | auto I = Stage.find(MI); | ||
| 136 | return I == Stage.end() ? -1 : I->second; | ||
| 137 |   } | ||
| 138 | |||
| 139 |   /// Return the cycle that MI is scheduled at, or -1. | ||
| 140 | int getCycle(MachineInstr *MI) { | ||
| 141 | auto I = Cycle.find(MI); | ||
| 142 | return I == Cycle.end() ? -1 : I->second; | ||
| 143 |   } | ||
| 144 | |||
| 145 |   /// Set the stage of a newly created instruction. | ||
| 146 | void setStage(MachineInstr *MI, int MIStage) { | ||
| 147 | assert(Stage.count(MI) == 0); | ||
| 148 | Stage[MI] = MIStage; | ||
| 149 |   } | ||
| 150 | |||
| 151 |   /// Return the rescheduled instructions in order. | ||
| 152 | ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; } | ||
| 153 | |||
| 154 | void dump() { print(dbgs()); } | ||
| 155 | void print(raw_ostream &OS); | ||
| 156 | }; | ||
| 157 | |||
| 158 | /// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, | ||
| 159 | /// rewriting the old loop and inserting prologs and epilogs as required. | ||
| 160 | class ModuloScheduleExpander { | ||
| 161 | public: | ||
| 162 | using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>; | ||
| 163 | |||
| 164 | private: | ||
| 165 | using ValueMapTy = DenseMap<unsigned, unsigned>; | ||
| 166 | using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>; | ||
| 167 | using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>; | ||
| 168 | |||
| 169 | ModuloSchedule &Schedule; | ||
| 170 | MachineFunction &MF; | ||
| 171 | const TargetSubtargetInfo &ST; | ||
| 172 | MachineRegisterInfo &MRI; | ||
| 173 | const TargetInstrInfo *TII; | ||
| 174 | LiveIntervals &LIS; | ||
| 175 | |||
| 176 | MachineBasicBlock *BB; | ||
| 177 | MachineBasicBlock *Preheader; | ||
| 178 | MachineBasicBlock *NewKernel = nullptr; | ||
| 179 | std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; | ||
| 180 | |||
| 181 |   /// Map for each register and the max difference between its uses and def. | ||
| 182 |   /// The first element in the pair is the max difference in stages. The | ||
| 183 |   /// second is true if the register defines a Phi value and loop value is | ||
| 184 |   /// scheduled before the Phi. | ||
| 185 | std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff; | ||
| 186 | |||
| 187 |   /// Instructions to change when emitting the final schedule. | ||
| 188 |   InstrChangesTy InstrChanges; | ||
| 189 | |||
| 190 | void generatePipelinedLoop(); | ||
| 191 | void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB, | ||
| 192 | ValueMapTy *VRMap, MBBVectorTy &PrologBBs); | ||
| 193 | void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB, | ||
| 194 | MachineBasicBlock *OrigBB, ValueMapTy *VRMap, | ||
| 195 | ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs, | ||
| 196 | MBBVectorTy &PrologBBs); | ||
| 197 | void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, | ||
| 198 | MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, | ||
| 199 | ValueMapTy *VRMap, InstrMapTy &InstrMap, | ||
| 200 | unsigned LastStageNum, unsigned CurStageNum, | ||
| 201 | bool IsLast); | ||
| 202 | void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1, | ||
| 203 | MachineBasicBlock *BB2, MachineBasicBlock *KernelBB, | ||
| 204 | ValueMapTy *VRMap, ValueMapTy *VRMapPhi, | ||
| 205 | InstrMapTy &InstrMap, unsigned LastStageNum, | ||
| 206 | unsigned CurStageNum, bool IsLast); | ||
| 207 | void removeDeadInstructions(MachineBasicBlock *KernelBB, | ||
| 208 | MBBVectorTy &EpilogBBs); | ||
| 209 | void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs); | ||
| 210 | void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs, | ||
| 211 | MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, | ||
| 212 | ValueMapTy *VRMap); | ||
| 213 | bool computeDelta(MachineInstr &MI, unsigned &Delta); | ||
| 214 | void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, | ||
| 215 | unsigned Num); | ||
| 216 | MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum, | ||
| 217 | unsigned InstStageNum); | ||
| 218 | MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum, | ||
| 219 | unsigned InstStageNum); | ||
| 220 | void updateInstruction(MachineInstr *NewMI, bool LastDef, | ||
| 221 | unsigned CurStageNum, unsigned InstrStageNum, | ||
| 222 | ValueMapTy *VRMap); | ||
| 223 | MachineInstr *findDefInLoop(unsigned Reg); | ||
| 224 | unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal, | ||
| 225 | unsigned LoopStage, ValueMapTy *VRMap, | ||
| 226 | MachineBasicBlock *BB); | ||
| 227 | void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum, | ||
| 228 | ValueMapTy *VRMap, InstrMapTy &InstrMap); | ||
| 229 | void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap, | ||
| 230 | unsigned CurStageNum, unsigned PhiNum, | ||
| 231 | MachineInstr *Phi, unsigned OldReg, | ||
| 232 | unsigned NewReg, unsigned PrevReg = 0); | ||
| 233 | bool isLoopCarried(MachineInstr &Phi); | ||
| 234 | |||
| 235 |   /// Return the max. number of stages/iterations that can occur between a | ||
| 236 |   /// register definition and its uses. | ||
| 237 | unsigned getStagesForReg(int Reg, unsigned CurStage) { | ||
| 238 | std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; | ||
| 239 | if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 && | ||
| 240 | Stages.second) | ||
| 241 | return 1; | ||
| 242 | return Stages.first; | ||
| 243 |   } | ||
| 244 | |||
| 245 |   /// The number of stages for a Phi is a little different than other | ||
| 246 |   /// instructions. The minimum value computed in RegToStageDiff is 1 | ||
| 247 |   /// because we assume the Phi is needed for at least 1 iteration. | ||
| 248 |   /// This is not the case if the loop value is scheduled prior to the | ||
| 249 |   /// Phi in the same stage.  This function returns the number of stages | ||
| 250 |   /// or iterations needed between the Phi definition and any uses. | ||
| 251 | unsigned getStagesForPhi(int Reg) { | ||
| 252 | std::pair<unsigned, bool> Stages = RegToStageDiff[Reg]; | ||
| 253 | if (Stages.second) | ||
| 254 | return Stages.first; | ||
| 255 | return Stages.first - 1; | ||
| 256 |   } | ||
| 257 | |||
| 258 | public: | ||
| 259 |   /// Create a new ModuloScheduleExpander. | ||
| 260 |   /// \arg InstrChanges Modifications to make to instructions with memory | ||
| 261 |   ///   operands. | ||
| 262 |   /// FIXME: InstrChanges is opaque and is an implementation detail of an | ||
| 263 |   ///   optimization in MachinePipeliner that crosses abstraction boundaries. | ||
| 264 | ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, | ||
| 265 | LiveIntervals &LIS, InstrChangesTy InstrChanges) | ||
| 266 | : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), | ||
| 267 | TII(ST.getInstrInfo()), LIS(LIS), | ||
| 268 | InstrChanges(std::move(InstrChanges)) {} | ||
| 269 | |||
| 270 |   /// Performs the actual expansion. | ||
| 271 | void expand(); | ||
| 272 |   /// Performs final cleanup after expansion. | ||
| 273 | void cleanup(); | ||
| 274 | |||
| 275 |   /// Returns the newly rewritten kernel block, or nullptr if this was | ||
| 276 |   /// optimized away. | ||
| 277 | MachineBasicBlock *getRewrittenKernel() { return NewKernel; } | ||
| 278 | }; | ||
| 279 | |||
| 280 | /// A reimplementation of ModuloScheduleExpander. It works by generating a | ||
| 281 | /// standalone kernel loop and peeling out the prologs and epilogs. | ||
| 282 | class PeelingModuloScheduleExpander { | ||
| 283 | public: | ||
| 284 | PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S, | ||
| 285 | LiveIntervals *LIS) | ||
| 286 | : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()), | ||
| 287 | TII(ST.getInstrInfo()), LIS(LIS) {} | ||
| 288 | |||
| 289 | void expand(); | ||
| 290 | |||
| 291 |   /// Runs ModuloScheduleExpander and treats it as a golden input to validate | ||
| 292 |   /// aspects of the code generated by PeelingModuloScheduleExpander. | ||
| 293 | void validateAgainstModuloScheduleExpander(); | ||
| 294 | |||
| 295 | protected: | ||
| 296 | ModuloSchedule &Schedule; | ||
| 297 | MachineFunction &MF; | ||
| 298 | const TargetSubtargetInfo &ST; | ||
| 299 | MachineRegisterInfo &MRI; | ||
| 300 | const TargetInstrInfo *TII; | ||
| 301 | LiveIntervals *LIS; | ||
| 302 | |||
| 303 |   /// The original loop block that gets rewritten in-place. | ||
| 304 | MachineBasicBlock *BB; | ||
| 305 |   /// The original loop preheader. | ||
| 306 | MachineBasicBlock *Preheader; | ||
| 307 |   /// All prolog and epilog blocks. | ||
| 308 | SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs; | ||
| 309 |   /// For every block, the stages that are produced. | ||
| 310 | DenseMap<MachineBasicBlock *, BitVector> LiveStages; | ||
| 311 |   /// For every block, the stages that are available. A stage can be available | ||
| 312 |   /// but not produced (in the epilog) or produced but not available (in the | ||
| 313 |   /// prolog). | ||
| 314 | DenseMap<MachineBasicBlock *, BitVector> AvailableStages; | ||
| 315 |   /// When peeling the epilogue keep track of the distance between the phi | ||
| 316 |   /// nodes and the kernel. | ||
| 317 | DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration; | ||
| 318 | |||
| 319 |   /// CanonicalMIs and BlockMIs form a bidirectional map between any of the | ||
| 320 |   /// loop kernel clones. | ||
| 321 | DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs; | ||
| 322 | DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *> | ||
| 323 |       BlockMIs; | ||
| 324 | |||
| 325 |   /// State passed from peelKernel to peelPrologAndEpilogs(). | ||
| 326 | std::deque<MachineBasicBlock *> PeeledFront, PeeledBack; | ||
| 327 |   /// Illegal phis that need to be deleted once we re-link stages. | ||
| 328 | SmallVector<MachineInstr *, 4> IllegalPhisToDelete; | ||
| 329 | |||
| 330 |   /// Converts BB from the original loop body to the rewritten, pipelined | ||
| 331 |   /// steady-state. | ||
| 332 | void rewriteKernel(); | ||
| 333 | |||
| 334 |   /// Peels one iteration of the rewritten kernel (BB) in the specified | ||
| 335 |   /// direction. | ||
| 336 | MachineBasicBlock *peelKernel(LoopPeelDirection LPD); | ||
| 337 |   // Delete instructions whose stage is less than MinStage in the given basic | ||
| 338 |   // block. | ||
| 339 | void filterInstructions(MachineBasicBlock *MB, int MinStage); | ||
| 340 |   // Move instructions of the given stage from sourceBB to DestBB. Remap the phi | ||
| 341 |   // instructions to keep a valid IR. | ||
| 342 | void moveStageBetweenBlocks(MachineBasicBlock *DestBB, | ||
| 343 | MachineBasicBlock *SourceBB, unsigned Stage); | ||
| 344 |   /// Peel the kernel forwards and backwards to produce prologs and epilogs, | ||
| 345 |   /// and stitch them together. | ||
| 346 | void peelPrologAndEpilogs(); | ||
| 347 |   /// All prolog and epilog blocks are clones of the kernel, so any produced | ||
| 348 |   /// register in one block has an corollary in all other blocks. | ||
| 349 | Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB); | ||
| 350 |   /// Change all users of MI, if MI is predicated out | ||
| 351 |   /// (LiveStages[MI->getParent()] == false). | ||
| 352 | void rewriteUsesOf(MachineInstr *MI); | ||
| 353 |   /// Insert branches between prologs, kernel and epilogs. | ||
| 354 | void fixupBranches(); | ||
| 355 |   /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block | ||
| 356 |   /// to a block dominated by all prologs and epilogs. This allows us to treat | ||
| 357 |   /// the loop exiting block as any other kernel clone. | ||
| 358 | MachineBasicBlock *CreateLCSSAExitingBlock(); | ||
| 359 |   /// Helper to get the stage of an instruction in the schedule. | ||
| 360 | unsigned getStage(MachineInstr *MI) { | ||
| 361 | if (CanonicalMIs.count(MI)) | ||
| 362 | MI = CanonicalMIs[MI]; | ||
| 363 | return Schedule.getStage(MI); | ||
| 364 |   } | ||
| 365 |   /// Helper function to find the right canonical register for a phi instruction | ||
| 366 |   /// coming from a peeled out prologue. | ||
| 367 | Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi); | ||
| 368 |   /// Target loop info before kernel peeling. | ||
| 369 | std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo; | ||
| 370 | }; | ||
| 371 | |||
| 372 | /// Expander that simply annotates each scheduled instruction with a post-instr | ||
| 373 | /// symbol that can be consumed by the ModuloScheduleTest pass. | ||
| 374 | /// | ||
| 375 | /// The post-instr symbol is a way of annotating an instruction that can be | ||
| 376 | /// roundtripped in MIR. The syntax is: | ||
| 377 | ///   MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5> | ||
| 378 | class ModuloScheduleTestAnnotater { | ||
| 379 | MachineFunction &MF; | ||
| 380 | ModuloSchedule &S; | ||
| 381 | |||
| 382 | public: | ||
| 383 | ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S) | ||
| 384 | : MF(MF), S(S) {} | ||
| 385 | |||
| 386 |   /// Performs the annotation. | ||
| 387 | void annotate(); | ||
| 388 | }; | ||
| 389 | |||
| 390 | } // end namespace llvm | ||
| 391 | |||
| 392 | #endif // LLVM_CODEGEN_MODULOSCHEDULE_H |