//===------------------------- LSUnit.h --------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// A Load/Store unit class that models load/store queues and that implements
/// a simple weak memory consistency model.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_MCA_HARDWAREUNITS_LSUNIT_H
#define LLVM_MCA_HARDWAREUNITS_LSUNIT_H

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MCA/HardwareUnits/HardwareUnit.h"
#include "llvm/MCA/Instruction.h"

namespace llvm {
namespace mca {

/// A node of a memory dependency graph. A MemoryGroup describes a set of
/// instructions with the same memory dependencies.
///
/// By construction, instructions of a MemoryGroup don't depend on each other.
/// At dispatch stage, instructions are mapped by the LSUnit to MemoryGroups.
/// A memory group identifier is then stored as a "token" in field
/// Instruction::LSUTokenID of each dispatched instruction. That token is used
/// internally by the LSUnit to track memory dependencies.
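///
/// As a minimal illustrative sketch (not part of the original documentation),
/// two groups can be chained through an order dependency like this:
///
/// \code
///   MemoryGroup Older, Younger;
///   Older.addInstruction();    // One memory operation joins the older group.
///   Older.addSuccessor(&Younger, /*IsDataDependent=*/false);
///   Younger.addInstruction();
///   // The younger group now has an unresolved predecessor.
///   assert(Younger.isWaiting() && !Younger.isReady());
/// \endcode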
class MemoryGroup {
  unsigned NumPredecessors;
  unsigned NumExecutingPredecessors;
  unsigned NumExecutedPredecessors;

  unsigned NumInstructions;
  unsigned NumExecuting;
  unsigned NumExecuted;
  // Successors that are in an order dependency with this group.
  SmallVector<MemoryGroup *, 4> OrderSucc;
  // Successors that are in a data dependency with this group.
  SmallVector<MemoryGroup *, 4> DataSucc;

  CriticalDependency CriticalPredecessor;
  InstRef CriticalMemoryInstruction;

  MemoryGroup(const MemoryGroup &) = delete;
  MemoryGroup &operator=(const MemoryGroup &) = delete;

public:
  MemoryGroup()
      : NumPredecessors(0), NumExecutingPredecessors(0),
        NumExecutedPredecessors(0), NumInstructions(0), NumExecuting(0),
        NumExecuted(0), CriticalPredecessor() {}
  MemoryGroup(MemoryGroup &&) = default;

  size_t getNumSuccessors() const {
    return OrderSucc.size() + DataSucc.size();
  }
  unsigned getNumPredecessors() const { return NumPredecessors; }
  unsigned getNumExecutingPredecessors() const {
    return NumExecutingPredecessors;
  }
  unsigned getNumExecutedPredecessors() const {
    return NumExecutedPredecessors;
  }
  unsigned getNumInstructions() const { return NumInstructions; }
  unsigned getNumExecuting() const { return NumExecuting; }
  unsigned getNumExecuted() const { return NumExecuted; }

  const InstRef &getCriticalMemoryInstruction() const {
    return CriticalMemoryInstruction;
  }
  const CriticalDependency &getCriticalPredecessor() const {
    return CriticalPredecessor;
  }

  void addSuccessor(MemoryGroup *Group, bool IsDataDependent) {
    // Do not need to add a dependency if there is no data dependency and all
    // instructions from this group have been issued already.
    if (!IsDataDependent && isExecuting())
      return;

    Group->NumPredecessors++;
    assert(!isExecuted() && "Should have been removed!");
    if (isExecuting())
      Group->onGroupIssued(CriticalMemoryInstruction, IsDataDependent);

    if (IsDataDependent)
      DataSucc.emplace_back(Group);
    else
      OrderSucc.emplace_back(Group);
  }

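  // Taken together, the predicates below define the lifecycle of a group:
  // Waiting (at least one predecessor group has not started execution yet),
  // Pending (every predecessor has issued, at least one is still executing),
  // Ready (all predecessors have fully executed), Executing (every
  // not-yet-executed instruction of this group has been issued), and
  // Executed (all instructions of this group have finished execution).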
  bool isWaiting() const {
    return NumPredecessors >
           (NumExecutingPredecessors + NumExecutedPredecessors);
  }
  bool isPending() const {
    return NumExecutingPredecessors &&
           ((NumExecutedPredecessors + NumExecutingPredecessors) ==
            NumPredecessors);
  }
  bool isReady() const { return NumExecutedPredecessors == NumPredecessors; }
  bool isExecuting() const {
    return NumExecuting && (NumExecuting == (NumInstructions - NumExecuted));
  }
  bool isExecuted() const { return NumInstructions == NumExecuted; }

  void onGroupIssued(const InstRef &IR, bool ShouldUpdateCriticalDep) {
    assert(!isReady() && "Unexpected group-start event!");
    NumExecutingPredecessors++;

    if (!ShouldUpdateCriticalDep)
      return;

    unsigned Cycles = IR.getInstruction()->getCyclesLeft();
    if (CriticalPredecessor.Cycles < Cycles) {
      CriticalPredecessor.IID = IR.getSourceIndex();
      CriticalPredecessor.Cycles = Cycles;
    }
  }

  void onGroupExecuted() {
    assert(!isReady() && "Inconsistent state found!");
    NumExecutingPredecessors--;
    NumExecutedPredecessors++;
  }

  void onInstructionIssued(const InstRef &IR) {
    assert(!isExecuting() && "Invalid internal state!");
    ++NumExecuting;

    // Update the critical memory instruction for this group.
    const Instruction &IS = *IR.getInstruction();
    if ((bool)CriticalMemoryInstruction) {
      const Instruction &OtherIS = *CriticalMemoryInstruction.getInstruction();
      if (OtherIS.getCyclesLeft() < IS.getCyclesLeft())
        CriticalMemoryInstruction = IR;
    } else {
      CriticalMemoryInstruction = IR;
    }

    if (!isExecuting())
      return;

    // Notify successors that this group started execution.
    for (MemoryGroup *MG : OrderSucc) {
      MG->onGroupIssued(CriticalMemoryInstruction, false);
      // Release the order dependency with this group.
      MG->onGroupExecuted();
    }

    for (MemoryGroup *MG : DataSucc)
      MG->onGroupIssued(CriticalMemoryInstruction, true);
  }

  void onInstructionExecuted(const InstRef &IR) {
    assert(isReady() && !isExecuted() && "Invalid internal state!");
    --NumExecuting;
    ++NumExecuted;

    if (CriticalMemoryInstruction &&
        CriticalMemoryInstruction.getSourceIndex() == IR.getSourceIndex()) {
      CriticalMemoryInstruction.invalidate();
    }

    if (!isExecuted())
      return;

    // Notify data dependent successors that this group has finished execution.
    for (MemoryGroup *MG : DataSucc)
      MG->onGroupExecuted();
  }

  void addInstruction() {
    assert(!getNumSuccessors() && "Cannot add instructions to this group!");
    ++NumInstructions;
  }

  void cycleEvent() {
    if (isWaiting() && CriticalPredecessor.Cycles)
      CriticalPredecessor.Cycles--;
  }
};

/// Abstract base interface for LS (load/store) units in llvm-mca.
class LSUnitBase : public HardwareUnit {
  /// Load queue size.
  ///
  /// A value of zero for this field means that the load queue is unbounded.
  /// Processor models can declare the size of a load queue via tablegen (see
  /// the definition of tablegen class LoadQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned LQSize;

  /// Store queue size.
  ///
  /// A value of zero for this field means that the store queue is unbounded.
  /// Processor models can declare the size of a store queue via tablegen (see
  /// the definition of tablegen class StoreQueue in
  /// llvm/Target/TargetSchedule.td).
  unsigned SQSize;

  unsigned UsedLQEntries;
  unsigned UsedSQEntries;

  /// True if loads don't alias with stores.
  ///
  /// By default, the LS unit assumes that loads and stores don't alias with
  /// each other. If this field is set to false, then loads are always assumed
  /// to alias with stores.
  const bool NoAlias;

  /// Used to map group identifiers to MemoryGroups.
  DenseMap<unsigned, std::unique_ptr<MemoryGroup>> Groups;
  unsigned NextGroupID;

public:
  LSUnitBase(const MCSchedModel &SM, unsigned LoadQueueSize,
             unsigned StoreQueueSize, bool AssumeNoAlias);

  virtual ~LSUnitBase();

  /// Returns the total number of entries in the load queue.
  unsigned getLoadQueueSize() const { return LQSize; }

  /// Returns the total number of entries in the store queue.
  unsigned getStoreQueueSize() const { return SQSize; }

  unsigned getUsedLQEntries() const { return UsedLQEntries; }
  unsigned getUsedSQEntries() const { return UsedSQEntries; }
  void acquireLQSlot() { ++UsedLQEntries; }
  void acquireSQSlot() { ++UsedSQEntries; }
  void releaseLQSlot() { --UsedLQEntries; }
  void releaseSQSlot() { --UsedSQEntries; }

  bool assumeNoAlias() const { return NoAlias; }

  enum Status {
    LSU_AVAILABLE = 0,
    LSU_LQUEUE_FULL, // Load Queue unavailable
    LSU_SQUEUE_FULL  // Store Queue unavailable
  };

  /// This method checks the availability of the load/store buffers.
  ///
  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR. By default, LSU_AVAILABLE is returned if IR
  /// is not a memory operation.
  virtual Status isAvailable(const InstRef &IR) const = 0;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// with an LSUnitBase::Status value of LSU_AVAILABLE.
  /// Returns the GroupID associated with this instruction. That value will be
  /// used to set the LSUTokenID field in class Instruction.
  virtual unsigned dispatch(const InstRef &IR) = 0;
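
  // Illustrative sketch (an assumption about typical client code, not from
  // the original header): a dispatch stage is expected to pair the two hooks
  // above, stalling while no queue entry is available:
  //
  //   if (LSU.isAvailable(IR) != LSUnitBase::LSU_AVAILABLE)
  //     return;                          // Stall: a load/store queue is full.
  //   unsigned Token = LSU.dispatch(IR); // Memory group ID for IR.
  //   // The token is then stored in Instruction::LSUTokenID, so that later
  //   // queries such as isReady(IR) can find the right MemoryGroup.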

  bool isSQEmpty() const { return !UsedSQEntries; }
  bool isLQEmpty() const { return !UsedLQEntries; }
  bool isSQFull() const { return SQSize && SQSize == UsedSQEntries; }
  bool isLQFull() const { return LQSize && LQSize == UsedLQEntries; }

  bool isValidGroupID(unsigned Index) const {
    return Index && (Groups.find(Index) != Groups.end());
  }

  /// Check if a previously dispatched instruction IR is now ready for
  /// execution.
  bool isReady(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isReady();
  }

  /// Check if instruction IR only depends on memory instructions that are
  /// currently executing.
  bool isPending(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isPending();
  }

  /// Check if instruction IR is still waiting on memory operations, and the
  /// wait time is still unknown.
  bool isWaiting(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return Group.isWaiting();
  }

  bool hasDependentUsers(const InstRef &IR) const {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    const MemoryGroup &Group = getGroup(GroupID);
    return !Group.isExecuted() && Group.getNumSuccessors();
  }

  const MemoryGroup &getGroup(unsigned Index) const {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  MemoryGroup &getGroup(unsigned Index) {
    assert(isValidGroupID(Index) && "Group doesn't exist!");
    return *Groups.find(Index)->second;
  }

  unsigned createMemoryGroup() {
    Groups.insert(
        std::make_pair(NextGroupID, std::make_unique<MemoryGroup>()));
    return NextGroupID++;
  }

  virtual void onInstructionExecuted(const InstRef &IR);

  // Loads are tracked by the LDQ (load queue) from dispatch until completion.
  // Stores are tracked by the STQ (store queue) from dispatch until commitment.
  // By default we conservatively assume that the LDQ receives a load at
  // dispatch. Loads leave the LDQ at retirement stage.
  virtual void onInstructionRetired(const InstRef &IR);

  virtual void onInstructionIssued(const InstRef &IR) {
    unsigned GroupID = IR.getInstruction()->getLSUTokenID();
    Groups[GroupID]->onInstructionIssued(IR);
  }

  virtual void cycleEvent();

#ifndef NDEBUG
  void dump() const;
#endif
};

/// Default Load/Store Unit (LS Unit) for simulated processors.
///
/// Each load (or store) consumes one entry in the load (or store) queue.
///
/// Rules are:
/// 1) A younger load is allowed to pass an older load only if there are no
///    stores nor barriers in between the two loads.
/// 2) A younger store is not allowed to pass an older store.
/// 3) A younger store is not allowed to pass an older load.
/// 4) A younger load is allowed to pass an older store only if the load does
///    not alias with the store.
///
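/// For example, given the program order { load A, store B, load C }: the
/// store to B may not pass the older load from A (rule 3), while the younger
/// load from C may pass the older store to B only if the two accesses do not
/// alias (rule 4).
///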
/// This class optimistically assumes that loads don't alias store operations.
/// Under this assumption, younger loads are always allowed to pass older
/// stores (this only affects rule 4).
/// Essentially, this class doesn't perform any sort of alias analysis to
/// identify aliasing loads and stores.
///
/// To enforce aliasing between loads and stores, flag `AssumeNoAlias` must be
/// set to `false` by the constructor of LSUnit.
///
/// Note that this class doesn't know about the existence of different memory
/// types for memory operations (example: write-through, write-combining, etc.).
/// Derived classes are responsible for implementing that extra knowledge, and
/// provide different sets of rules for loads and stores by overriding method
/// `isReady()`.
/// To emulate a write-combining memory type, rule 2 must be relaxed in a
/// derived class to enable the reordering of non-aliasing store operations.
///
/// No assumptions are made by this class on the size of the store buffer. This
/// class doesn't know how to identify cases where store-to-load forwarding may
/// occur.
///
/// LSUnit doesn't attempt to predict whether a load or store hits or misses
/// the L1 cache. To be more specific, LSUnit doesn't know anything about
/// cache hierarchy and memory types.
/// It only knows if an instruction "mayLoad" and/or "mayStore". For loads, the
/// scheduling model provides an "optimistic" load-to-use latency (which usually
/// matches the load-to-use latency for when there is a hit in the L1D).
/// Derived classes may expand this knowledge.
///
/// Class MCInstrDesc in LLVM doesn't know about serializing operations, nor
/// memory-barrier like instructions.
/// LSUnit conservatively assumes that an instruction which `mayLoad` and has
/// `unmodeled side effects` behaves like a "soft" load-barrier. That means it
/// serializes loads without forcing a flush of the load queue.
/// Similarly, instructions that both `mayStore` and have `unmodeled side
/// effects` are treated like store barriers. A full memory
/// barrier is a 'mayLoad' and 'mayStore' instruction with unmodeled side
/// effects. This is obviously inaccurate, but this is the best that we can do
/// at the moment.
///
/// Each load/store barrier consumes one entry in the load/store queue. A
/// load/store barrier enforces ordering of loads/stores:
///  - A younger load cannot pass a load barrier.
///  - A younger store cannot pass a store barrier.
///
/// A younger load has to wait for the memory load barrier to execute.
/// A load/store barrier is "executed" when it becomes the oldest entry in
/// the load/store queue(s). That also means that all the older loads/stores
/// have already been executed.
class LSUnit : public LSUnitBase {
  // This class doesn't know about the latency of a load instruction. So, it
  // conservatively/pessimistically assumes that the latency of a load opcode
  // matches the instruction latency.
  //
  // FIXME: In the absence of cache misses (i.e. L1I/L1D/iTLB/dTLB hits/misses),
  // and load/store conflicts, the latency of a load is determined by the depth
  // of the load pipeline. So, we could use field `LoadLatency` in the
  // MCSchedModel to model that latency.
  // Field `LoadLatency` often matches the so-called 'load-to-use' latency from
  // L1D, and it usually already accounts for any extra latency due to data
  // forwarding.
  // When doing throughput analysis, `LoadLatency` is likely to
  // be a better predictor of load latency than instruction latency. This is
  // particularly true when simulating code with temporal/spatial locality of
  // memory accesses.
  // Using `LoadLatency` (instead of the instruction latency) is also expected
  // to improve the load queue allocation for long latency instructions with
  // folded memory operands (See PR39829).
  //
  // FIXME: On some processors, load/store operations are split into multiple
  // uOps. For example, X86 AMD Jaguar natively supports 128-bit data types, but
  // not 256-bit data types. So, a 256-bit load is effectively split into two
  // 128-bit loads, and each split load consumes one 'LoadQueue' entry. For
  // simplicity, this class optimistically assumes that a load instruction only
  // consumes one entry in the LoadQueue. Similarly, store instructions only
  // consume a single entry in the StoreQueue.
  // In future, we should reassess the quality of this design, and consider
  // alternative approaches that let instructions specify the number of
  // load/store queue entries which they consume at dispatch stage (See
  // PR39830).
  //
  // An instruction that is both 'mayStore' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a store barrier. It forces older stores to be
  // executed before newer stores are issued.
  //
  // An instruction that is both 'mayLoad' and 'HasUnmodeledSideEffects' is
  // conservatively treated as a load barrier. It forces older loads to execute
  // before newer loads are issued.
  unsigned CurrentLoadGroupID;
  unsigned CurrentLoadBarrierGroupID;
  unsigned CurrentStoreGroupID;
  unsigned CurrentStoreBarrierGroupID;

public:
  LSUnit(const MCSchedModel &SM)
      : LSUnit(SM, /* LQSize */ 0, /* SQSize */ 0, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ)
      : LSUnit(SM, LQ, SQ, /* NoAlias */ false) {}
  LSUnit(const MCSchedModel &SM, unsigned LQ, unsigned SQ, bool AssumeNoAlias)
      : LSUnitBase(SM, LQ, SQ, AssumeNoAlias), CurrentLoadGroupID(0),
        CurrentLoadBarrierGroupID(0), CurrentStoreGroupID(0),
        CurrentStoreBarrierGroupID(0) {}
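
  // Illustrative usage sketch (an assumption about client code, not from the
  // original header): construct an LS unit with hypothetical queue sizes,
  // keeping the default conservative assumption that loads may alias stores.
  //
  //   LSUnit LSU(SM, /* LQSize */ 16, /* SQSize */ 8, /* NoAlias */ false);
  //   assert(LSU.getLoadQueueSize() == 16 && !LSU.isLQFull());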

  /// Returns LSU_AVAILABLE if there are enough load/store queue entries to
  /// accommodate instruction IR.
  Status isAvailable(const InstRef &IR) const override;

  /// Allocates LS resources for instruction IR.
  ///
  /// This method assumes that a previous call to `isAvailable(IR)` succeeded
  /// returning LSU_AVAILABLE.
  ///
  /// By default, rules are:
  /// 1. A store may not pass a previous store.
  /// 2. A load may not pass a previous store unless flag 'NoAlias' is set.
  /// 3. A load may pass a previous load.
  /// 4. A store may not pass a previous load (regardless of flag 'NoAlias').
  /// 5. A load has to wait until an older load barrier is fully executed.
  /// 6. A store has to wait until an older store barrier is fully executed.
  unsigned dispatch(const InstRef &IR) override;

  void onInstructionExecuted(const InstRef &IR) override;
};

} // namespace mca
} // namespace llvm

#endif // LLVM_MCA_HARDWAREUNITS_LSUNIT_H