//===- ModuloSchedule.h - Software pipeline schedule expansion ------------===//
 
//
 
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 
// See https://llvm.org/LICENSE.txt for license information.
 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
//
 
//===----------------------------------------------------------------------===//
 
//
 
// Software pipelining (SWP) is an instruction scheduling technique for loops
 
// that overlaps loop iterations and exploits ILP via compiler transformations.
 
//
 
// There are multiple methods for analyzing a loop and creating a schedule.
 
// An example algorithm is Swing Modulo Scheduling (implemented by the
 
// MachinePipeliner). The details of how a schedule is arrived at are irrelevant
 
// for the task of actually rewriting a loop to adhere to the schedule, which
 
// is what this file does.
 
//
 
// A schedule is, for every instruction in a block, a Cycle and a Stage. Note
 
// that we only support single-block loops, so "block" and "loop" can be used
 
// interchangably.
 
//
 
// The Cycle of an instruction defines a partial order of the instructions in
 
// the remapped loop. Instructions within a cycle must not consume the output
 
// of any instruction in the same cycle. Cycle information is assumed to have
 
// been calculated such that the processor will execute instructions in
 
// lock-step (for example in a VLIW ISA).
 
//
 
// The Stage of an instruction defines the mapping between logical loop
 
// iterations and pipelined loop iterations. An example (unrolled) pipeline
 
// may look something like:
 
//
 
//  I0[0]                      Execute instruction I0 of iteration 0
 
//  I1[0], I0[1]               Execute I0 of iteration 1 and I1 of iteration 1
 
//         I1[1], I0[2]
 
//                I1[2], I0[3]
 
//
 
// In the schedule for this unrolled sequence we would say that I0 was scheduled
 
// in stage 0 and I1 in stage 1:
 
//
 
//  loop:
 
//    [stage 0] x = I0
 
//    [stage 1] I1 x (from stage 0)
 
//
 
// And to actually generate valid code we must insert a phi:
 
//
 
//  loop:
 
//    x' = phi(x)
 
//    x = I0
 
//    I1 x'
 
//
 
// This is a simple example; the rules for how to generate correct code given
 
// an arbitrary schedule containing loop-carried values are complex.
 
//
 
// Note that these examples only mention the steady-state kernel of the
 
// generated loop; prologs and epilogs must be generated also that prime and
 
// flush the pipeline. Doing so is nontrivial.
 
//
 
//===----------------------------------------------------------------------===//
 
 
 
#ifndef LLVM_CODEGEN_MODULOSCHEDULE_H
 
#define LLVM_CODEGEN_MODULOSCHEDULE_H
 
 
 
#include "llvm/CodeGen/MachineFunction.h"
 
#include "llvm/CodeGen/MachineLoopUtils.h"
 
#include "llvm/CodeGen/TargetInstrInfo.h"
 
#include "llvm/CodeGen/TargetSubtargetInfo.h"
 
#include <deque>
 
#include <vector>
 
 
 
namespace llvm {
 
class MachineBasicBlock;
 
class MachineLoop;
 
class MachineRegisterInfo;
 
class MachineInstr;
 
class LiveIntervals;
 
 
 
/// Represents a schedule for a single-block loop. For every instruction we
 
/// maintain a Cycle and Stage.
 
class ModuloSchedule {
 
private:
 
  /// The block containing the loop instructions.
 
  MachineLoop *Loop;
 
 
 
  /// The instructions to be generated, in total order. Cycle provides a partial
 
  /// order; the total order within cycles has been decided by the schedule
 
  /// producer.
 
  std::vector<MachineInstr *> ScheduledInstrs;
 
 
 
  /// The cycle for each instruction.
 
  DenseMap<MachineInstr *, int> Cycle;
 
 
 
  /// The stage for each instruction.
 
  DenseMap<MachineInstr *, int> Stage;
 
 
 
  /// The number of stages in this schedule (Max(Stage) + 1).
 
  int NumStages;
 
 
 
public:
 
  /// Create a new ModuloSchedule.
 
  /// \arg ScheduledInstrs The new loop instructions, in total resequenced
 
  ///    order.
 
  /// \arg Cycle Cycle index for all instructions in ScheduledInstrs. Cycle does
 
  ///    not need to start at zero. ScheduledInstrs must be partially ordered by
 
  ///    Cycle.
 
  /// \arg Stage Stage index for all instructions in ScheduleInstrs.
 
  ModuloSchedule(MachineFunction &MF, MachineLoop *Loop,
 
                 std::vector<MachineInstr *> ScheduledInstrs,
 
                 DenseMap<MachineInstr *, int> Cycle,
 
                 DenseMap<MachineInstr *, int> Stage)
 
      : Loop(Loop), ScheduledInstrs(ScheduledInstrs), Cycle(std::move(Cycle)),
 
        Stage(std::move(Stage)) {
 
    NumStages = 0;
 
    for (auto &KV : this->Stage)
 
      NumStages = std::max(NumStages, KV.second);
 
    ++NumStages;
 
  }
 
 
 
  /// Return the single-block loop being scheduled.
 
  MachineLoop *getLoop() const { return Loop; }
 
 
 
  /// Return the number of stages contained in this schedule, which is the
 
  /// largest stage index + 1.
 
  int getNumStages() const { return NumStages; }
 
 
 
  /// Return the first cycle in the schedule, which is the cycle index of the
 
  /// first instruction.
 
  int getFirstCycle() { return Cycle[ScheduledInstrs.front()]; }
 
 
 
  /// Return the final cycle in the schedule, which is the cycle index of the
 
  /// last instruction.
 
  int getFinalCycle() { return Cycle[ScheduledInstrs.back()]; }
 
 
 
  /// Return the stage that MI is scheduled in, or -1.
 
  int getStage(MachineInstr *MI) {
 
    auto I = Stage.find(MI);
 
    return I == Stage.end() ? -1 : I->second;
 
  }
 
 
 
  /// Return the cycle that MI is scheduled at, or -1.
 
  int getCycle(MachineInstr *MI) {
 
    auto I = Cycle.find(MI);
 
    return I == Cycle.end() ? -1 : I->second;
 
  }
 
 
 
  /// Set the stage of a newly created instruction.
 
  void setStage(MachineInstr *MI, int MIStage) {
 
    assert(Stage.count(MI) == 0);
 
    Stage[MI] = MIStage;
 
  }
 
 
 
  /// Return the rescheduled instructions in order.
 
  ArrayRef<MachineInstr *> getInstructions() { return ScheduledInstrs; }
 
 
 
  void dump() { print(dbgs()); }
 
  void print(raw_ostream &OS);
 
};
 
 
 
/// The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place,
 
/// rewriting the old loop and inserting prologs and epilogs as required.
 
class ModuloScheduleExpander {
 
public:
 
  using InstrChangesTy = DenseMap<MachineInstr *, std::pair<unsigned, int64_t>>;
 
 
 
private:
 
  using ValueMapTy = DenseMap<unsigned, unsigned>;
 
  using MBBVectorTy = SmallVectorImpl<MachineBasicBlock *>;
 
  using InstrMapTy = DenseMap<MachineInstr *, MachineInstr *>;
 
 
 
  ModuloSchedule &Schedule;
 
  MachineFunction &MF;
 
  const TargetSubtargetInfo &ST;
 
  MachineRegisterInfo &MRI;
 
  const TargetInstrInfo *TII;
 
  LiveIntervals &LIS;
 
 
 
  MachineBasicBlock *BB;
 
  MachineBasicBlock *Preheader;
 
  MachineBasicBlock *NewKernel = nullptr;
 
  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
 
 
 
  /// Map for each register and the max difference between its uses and def.
 
  /// The first element in the pair is the max difference in stages. The
 
  /// second is true if the register defines a Phi value and loop value is
 
  /// scheduled before the Phi.
 
  std::map<unsigned, std::pair<unsigned, bool>> RegToStageDiff;
 
 
 
  /// Instructions to change when emitting the final schedule.
 
  InstrChangesTy InstrChanges;
 
 
 
  void generatePipelinedLoop();
 
  void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB,
 
                      ValueMapTy *VRMap, MBBVectorTy &PrologBBs);
 
  void generateEpilog(unsigned LastStage, MachineBasicBlock *KernelBB,
 
                      MachineBasicBlock *OrigBB, ValueMapTy *VRMap,
 
                      ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
 
                      MBBVectorTy &PrologBBs);
 
  void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
 
                            MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
 
                            ValueMapTy *VRMap, InstrMapTy &InstrMap,
 
                            unsigned LastStageNum, unsigned CurStageNum,
 
                            bool IsLast);
 
  void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
 
                    MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
 
                    ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
 
                    InstrMapTy &InstrMap, unsigned LastStageNum,
 
                    unsigned CurStageNum, bool IsLast);
 
  void removeDeadInstructions(MachineBasicBlock *KernelBB,
 
                              MBBVectorTy &EpilogBBs);
 
  void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs);
 
  void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs,
 
                   MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
 
                   ValueMapTy *VRMap);
 
  bool computeDelta(MachineInstr &MI, unsigned &Delta);
 
  void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
 
                         unsigned Num);
 
  MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
 
                           unsigned InstStageNum);
 
  MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
 
                                    unsigned InstStageNum);
 
  void updateInstruction(MachineInstr *NewMI, bool LastDef,
 
                         unsigned CurStageNum, unsigned InstrStageNum,
 
                         ValueMapTy *VRMap);
 
  MachineInstr *findDefInLoop(unsigned Reg);
 
  unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
 
                         unsigned LoopStage, ValueMapTy *VRMap,
 
                         MachineBasicBlock *BB);
 
  void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
 
                        ValueMapTy *VRMap, InstrMapTy &InstrMap);
 
  void rewriteScheduledInstr(MachineBasicBlock *BB, InstrMapTy &InstrMap,
 
                             unsigned CurStageNum, unsigned PhiNum,
 
                             MachineInstr *Phi, unsigned OldReg,
 
                             unsigned NewReg, unsigned PrevReg = 0);
 
  bool isLoopCarried(MachineInstr &Phi);
 
 
 
  /// Return the max. number of stages/iterations that can occur between a
 
  /// register definition and its uses.
 
  unsigned getStagesForReg(int Reg, unsigned CurStage) {
 
    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
 
    if ((int)CurStage > Schedule.getNumStages() - 1 && Stages.first == 0 &&
 
        Stages.second)
 
      return 1;
 
    return Stages.first;
 
  }
 
 
 
  /// The number of stages for a Phi is a little different than other
 
  /// instructions. The minimum value computed in RegToStageDiff is 1
 
  /// because we assume the Phi is needed for at least 1 iteration.
 
  /// This is not the case if the loop value is scheduled prior to the
 
  /// Phi in the same stage.  This function returns the number of stages
 
  /// or iterations needed between the Phi definition and any uses.
 
  unsigned getStagesForPhi(int Reg) {
 
    std::pair<unsigned, bool> Stages = RegToStageDiff[Reg];
 
    if (Stages.second)
 
      return Stages.first;
 
    return Stages.first - 1;
 
  }
 
 
 
public:
 
  /// Create a new ModuloScheduleExpander.
 
  /// \arg InstrChanges Modifications to make to instructions with memory
 
  ///   operands.
 
  /// FIXME: InstrChanges is opaque and is an implementation detail of an
 
  ///   optimization in MachinePipeliner that crosses abstraction boundaries.
 
  ModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
 
                         LiveIntervals &LIS, InstrChangesTy InstrChanges)
 
      : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
 
        TII(ST.getInstrInfo()), LIS(LIS),
 
        InstrChanges(std::move(InstrChanges)) {}
 
 
 
  /// Performs the actual expansion.
 
  void expand();
 
  /// Performs final cleanup after expansion.
 
  void cleanup();
 
 
 
  /// Returns the newly rewritten kernel block, or nullptr if this was
 
  /// optimized away.
 
  MachineBasicBlock *getRewrittenKernel() { return NewKernel; }
 
};
 
 
 
/// A reimplementation of ModuloScheduleExpander. It works by generating a
 
/// standalone kernel loop and peeling out the prologs and epilogs.
 
class PeelingModuloScheduleExpander {
 
public:
 
  PeelingModuloScheduleExpander(MachineFunction &MF, ModuloSchedule &S,
 
                                LiveIntervals *LIS)
 
      : Schedule(S), MF(MF), ST(MF.getSubtarget()), MRI(MF.getRegInfo()),
 
        TII(ST.getInstrInfo()), LIS(LIS) {}
 
 
 
  void expand();
 
 
 
  /// Runs ModuloScheduleExpander and treats it as a golden input to validate
 
  /// aspects of the code generated by PeelingModuloScheduleExpander.
 
  void validateAgainstModuloScheduleExpander();
 
 
 
protected:
 
  ModuloSchedule &Schedule;
 
  MachineFunction &MF;
 
  const TargetSubtargetInfo &ST;
 
  MachineRegisterInfo &MRI;
 
  const TargetInstrInfo *TII;
 
  LiveIntervals *LIS;
 
 
 
  /// The original loop block that gets rewritten in-place.
 
  MachineBasicBlock *BB;
 
  /// The original loop preheader.
 
  MachineBasicBlock *Preheader;
 
  /// All prolog and epilog blocks.
 
  SmallVector<MachineBasicBlock *, 4> Prologs, Epilogs;
 
  /// For every block, the stages that are produced.
 
  DenseMap<MachineBasicBlock *, BitVector> LiveStages;
 
  /// For every block, the stages that are available. A stage can be available
 
  /// but not produced (in the epilog) or produced but not available (in the
 
  /// prolog).
 
  DenseMap<MachineBasicBlock *, BitVector> AvailableStages;
 
  /// When peeling the epilogue keep track of the distance between the phi
 
  /// nodes and the kernel.
 
  DenseMap<MachineInstr *, unsigned> PhiNodeLoopIteration;
 
 
 
  /// CanonicalMIs and BlockMIs form a bidirectional map between any of the
 
  /// loop kernel clones.
 
  DenseMap<MachineInstr *, MachineInstr *> CanonicalMIs;
 
  DenseMap<std::pair<MachineBasicBlock *, MachineInstr *>, MachineInstr *>
 
      BlockMIs;
 
 
 
  /// State passed from peelKernel to peelPrologAndEpilogs().
 
  std::deque<MachineBasicBlock *> PeeledFront, PeeledBack;
 
  /// Illegal phis that need to be deleted once we re-link stages.
 
  SmallVector<MachineInstr *, 4> IllegalPhisToDelete;
 
 
 
  /// Converts BB from the original loop body to the rewritten, pipelined
 
  /// steady-state.
 
  void rewriteKernel();
 
 
 
  /// Peels one iteration of the rewritten kernel (BB) in the specified
 
  /// direction.
 
  MachineBasicBlock *peelKernel(LoopPeelDirection LPD);
 
  // Delete instructions whose stage is less than MinStage in the given basic
 
  // block.
 
  void filterInstructions(MachineBasicBlock *MB, int MinStage);
 
  // Move instructions of the given stage from sourceBB to DestBB. Remap the phi
 
  // instructions to keep a valid IR.
 
  void moveStageBetweenBlocks(MachineBasicBlock *DestBB,
 
                              MachineBasicBlock *SourceBB, unsigned Stage);
 
  /// Peel the kernel forwards and backwards to produce prologs and epilogs,
 
  /// and stitch them together.
 
  void peelPrologAndEpilogs();
 
  /// All prolog and epilog blocks are clones of the kernel, so any produced
 
  /// register in one block has an corollary in all other blocks.
 
  Register getEquivalentRegisterIn(Register Reg, MachineBasicBlock *BB);
 
  /// Change all users of MI, if MI is predicated out
 
  /// (LiveStages[MI->getParent()] == false).
 
  void rewriteUsesOf(MachineInstr *MI);
 
  /// Insert branches between prologs, kernel and epilogs.
 
  void fixupBranches();
 
  /// Create a poor-man's LCSSA by cloning only the PHIs from the kernel block
 
  /// to a block dominated by all prologs and epilogs. This allows us to treat
 
  /// the loop exiting block as any other kernel clone.
 
  MachineBasicBlock *CreateLCSSAExitingBlock();
 
  /// Helper to get the stage of an instruction in the schedule.
 
  unsigned getStage(MachineInstr *MI) {
 
    if (CanonicalMIs.count(MI))
 
      MI = CanonicalMIs[MI];
 
    return Schedule.getStage(MI);
 
  }
 
  /// Helper function to find the right canonical register for a phi instruction
 
  /// coming from a peeled out prologue.
 
  Register getPhiCanonicalReg(MachineInstr* CanonicalPhi, MachineInstr* Phi);
 
  /// Target loop info before kernel peeling.
 
  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> LoopInfo;
 
};
 
 
 
/// Expander that simply annotates each scheduled instruction with a post-instr
 
/// symbol that can be consumed by the ModuloScheduleTest pass.
 
///
 
/// The post-instr symbol is a way of annotating an instruction that can be
 
/// roundtripped in MIR. The syntax is:
 
///   MYINST %0, post-instr-symbol <mcsymbol Stage-1_Cycle-5>
 
class ModuloScheduleTestAnnotater {
 
  MachineFunction &MF;
 
  ModuloSchedule &S;
 
 
 
public:
 
  ModuloScheduleTestAnnotater(MachineFunction &MF, ModuloSchedule &S)
 
      : MF(MF), S(S) {}
 
 
 
  /// Performs the annotation.
 
  void annotate();
 
};
 
 
 
} // end namespace llvm
 
 
 
#endif // LLVM_CODEGEN_MODULOSCHEDULE_H