
  1. //==- BlockFrequencyInfoImpl.h - Block Frequency Implementation --*- C++ -*-==//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // Shared implementation of BlockFrequency for IR and Machine Instructions.
  10. // See the documentation below for BlockFrequencyInfoImpl for details.
  11. //
  12. //===----------------------------------------------------------------------===//
  13.  
  14. #ifndef LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  15. #define LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  16.  
  17. #include "llvm/ADT/BitVector.h"
  18. #include "llvm/ADT/DenseMap.h"
  19. #include "llvm/ADT/DenseSet.h"
  20. #include "llvm/ADT/GraphTraits.h"
  21. #include "llvm/ADT/PostOrderIterator.h"
  22. #include "llvm/ADT/SmallPtrSet.h"
  23. #include "llvm/ADT/SmallVector.h"
  24. #include "llvm/ADT/SparseBitVector.h"
  25. #include "llvm/ADT/Twine.h"
  26. #include "llvm/ADT/iterator_range.h"
  27. #include "llvm/IR/BasicBlock.h"
  28. #include "llvm/IR/ValueHandle.h"
  29. #include "llvm/Support/BlockFrequency.h"
  30. #include "llvm/Support/BranchProbability.h"
  31. #include "llvm/Support/CommandLine.h"
  32. #include "llvm/Support/DOTGraphTraits.h"
  33. #include "llvm/Support/Debug.h"
  34. #include "llvm/Support/Format.h"
  35. #include "llvm/Support/ScaledNumber.h"
  36. #include "llvm/Support/raw_ostream.h"
  37. #include <algorithm>
  38. #include <cassert>
  39. #include <cstddef>
  40. #include <cstdint>
  41. #include <deque>
  42. #include <iterator>
  43. #include <limits>
  44. #include <list>
  45. #include <optional>
  46. #include <queue>
  47. #include <string>
  48. #include <utility>
  49. #include <vector>
  50.  
  51. #define DEBUG_TYPE "block-freq"
  52.  
  53. namespace llvm {
  54. extern llvm::cl::opt<bool> CheckBFIUnknownBlockQueries;
  55.  
  56. extern llvm::cl::opt<bool> UseIterativeBFIInference;
  57. extern llvm::cl::opt<unsigned> IterativeBFIMaxIterationsPerBlock;
  58. extern llvm::cl::opt<double> IterativeBFIPrecision;
  59.  
  60. class BranchProbabilityInfo;
  61. class Function;
  62. class Loop;
  63. class LoopInfo;
  64. class MachineBasicBlock;
  65. class MachineBranchProbabilityInfo;
  66. class MachineFunction;
  67. class MachineLoop;
  68. class MachineLoopInfo;
  69.  
  70. namespace bfi_detail {
  71.  
  72. struct IrreducibleGraph;
  73.  
  74. // This is part of a workaround for a GCC 4.7 crash on lambdas.
  75. template <class BT> struct BlockEdgesAdder;
  76.  
  77. /// Mass of a block.
  78. ///
  79. /// This class implements a sort of fixed-point fraction always between 0.0 and
  80. /// 1.0.  getMass() == std::numeric_limits<uint64_t>::max() indicates a value of
  81. /// 1.0.
  82. ///
  83. /// Masses can be added and subtracted.  Simple saturation arithmetic is used,
  84. /// so arithmetic operations never overflow or underflow.
  85. ///
  86. /// Masses can be multiplied.  Multiplication treats full mass as 1.0 and uses
  87. /// an inexpensive floating-point algorithm that's off-by-one (almost, but not
  88. /// quite, maximum precision).
  89. ///
  90. /// Masses can be scaled by \a BranchProbability at maximum precision.
  91. class BlockMass {
  92.   uint64_t Mass = 0;
  93.  
  94. public:
  95.   BlockMass() = default;
  96.   explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
  97.  
  98.   static BlockMass getEmpty() { return BlockMass(); }
  99.  
  100.   static BlockMass getFull() {
  101.     return BlockMass(std::numeric_limits<uint64_t>::max());
  102.   }
  103.  
  104.   uint64_t getMass() const { return Mass; }
  105.  
  106.   bool isFull() const { return Mass == std::numeric_limits<uint64_t>::max(); }
  107.   bool isEmpty() const { return !Mass; }
  108.  
  109.   bool operator!() const { return isEmpty(); }
  110.  
  111.   /// Add another mass.
  112.   ///
  113.   /// Adds another mass, saturating at \a isFull() rather than overflowing.
  114.   BlockMass &operator+=(BlockMass X) {
  115.     uint64_t Sum = Mass + X.Mass;
  116.     Mass = Sum < Mass ? std::numeric_limits<uint64_t>::max() : Sum;
  117.     return *this;
  118.   }
  119.  
  120.   /// Subtract another mass.
  121.   ///
  122.   /// Subtracts another mass, saturating at \a isEmpty() rather than
  123.   /// underflowing.
  124.   BlockMass &operator-=(BlockMass X) {
  125.     uint64_t Diff = Mass - X.Mass;
  126.     Mass = Diff > Mass ? 0 : Diff;
  127.     return *this;
  128.   }
  129.  
  130.   BlockMass &operator*=(BranchProbability P) {
  131.     Mass = P.scale(Mass);
  132.     return *this;
  133.   }
  134.  
  135.   bool operator==(BlockMass X) const { return Mass == X.Mass; }
  136.   bool operator!=(BlockMass X) const { return Mass != X.Mass; }
  137.   bool operator<=(BlockMass X) const { return Mass <= X.Mass; }
  138.   bool operator>=(BlockMass X) const { return Mass >= X.Mass; }
  139.   bool operator<(BlockMass X) const { return Mass < X.Mass; }
  140.   bool operator>(BlockMass X) const { return Mass > X.Mass; }
  141.  
  142.   /// Convert to scaled number.
  143.   ///
  144.   /// Convert to \a ScaledNumber.  \a isFull() gives 1.0, while \a isEmpty()
  145.   /// gives slightly above 0.0.
  146.   ScaledNumber<uint64_t> toScaled() const;
  147.  
  148.   void dump() const;
  149.   raw_ostream &print(raw_ostream &OS) const;
  150. };
  151.  
  152. inline BlockMass operator+(BlockMass L, BlockMass R) {
  153.   return BlockMass(L) += R;
  154. }
  155. inline BlockMass operator-(BlockMass L, BlockMass R) {
  156.   return BlockMass(L) -= R;
  157. }
  158. inline BlockMass operator*(BlockMass L, BranchProbability R) {
  159.   return BlockMass(L) *= R;
  160. }
  161. inline BlockMass operator*(BranchProbability L, BlockMass R) {
  162.   return BlockMass(R) *= L;
  163. }
  164.  
  165. inline raw_ostream &operator<<(raw_ostream &OS, BlockMass X) {
  166.   return X.print(OS);
  167. }
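
// Illustrative sketch (not part of the upstream header; the helper name is
// made up for this example): a minimal check of the saturating arithmetic
// documented on BlockMass above.  Adding to full mass clamps at isFull()
// instead of overflowing, and subtracting more mass than is available clamps
// at isEmpty() instead of underflowing.
inline bool blockMassSaturationExample() {
  BlockMass Half(std::numeric_limits<uint64_t>::max() / 2);
  BlockMass Full = BlockMass::getFull();
  // Half + Full would overflow a plain uint64_t; BlockMass saturates at 1.0.
  bool AddSaturates = (Half + Full).isFull();
  // Half - Full would underflow; BlockMass saturates at 0.0 instead.
  bool SubSaturates = (Half - Full).isEmpty();
  return AddSaturates && SubSaturates;
}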
  168.  
  169. } // end namespace bfi_detail
  170.  
  171. /// Base class for BlockFrequencyInfoImpl
  172. ///
  173. /// BlockFrequencyInfoImplBase has supporting data structures and some
  174. /// algorithms for BlockFrequencyInfoImpl.  Only algorithms that depend on
  175. /// the block type (or that call such algorithms) are skipped here.
  176. ///
  177. /// Nevertheless, the majority of the overall algorithm documentation lives with
  178. /// BlockFrequencyInfoImpl.  See there for details.
  179. class BlockFrequencyInfoImplBase {
  180. public:
  181.   using Scaled64 = ScaledNumber<uint64_t>;
  182.   using BlockMass = bfi_detail::BlockMass;
  183.  
  184.   /// Representative of a block.
  185.   ///
  186.   /// This is a simple wrapper around an index into the reverse-post-order
  187.   /// traversal of the blocks.
  188.   ///
  189.   /// Unlike a block pointer, its order has meaning (location in the
  190.   /// topological sort) and its class is the same regardless of block type.
  191.   struct BlockNode {
  192.     using IndexType = uint32_t;
  193.  
  194.     IndexType Index;
  195.  
  196.     BlockNode() : Index(std::numeric_limits<uint32_t>::max()) {}
  197.     BlockNode(IndexType Index) : Index(Index) {}
  198.  
  199.     bool operator==(const BlockNode &X) const { return Index == X.Index; }
  200.     bool operator!=(const BlockNode &X) const { return Index != X.Index; }
  201.     bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
  202.     bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
  203.     bool operator<(const BlockNode &X) const { return Index < X.Index; }
  204.     bool operator>(const BlockNode &X) const { return Index > X.Index; }
  205.  
  206.     bool isValid() const { return Index <= getMaxIndex(); }
  207.  
  208.     static size_t getMaxIndex() {
  209.        return std::numeric_limits<uint32_t>::max() - 1;
  210.     }
  211.   };
  212.  
  213.   /// Stats about a block itself.
  214.   struct FrequencyData {
  215.     Scaled64 Scaled;
  216.     uint64_t Integer;
  217.   };
  218.  
  219.   /// Data about a loop.
  220.   ///
  221.   /// Contains the data necessary to represent a loop as a pseudo-node once it's
  222.   /// packaged.
  223.   struct LoopData {
  224.     using ExitMap = SmallVector<std::pair<BlockNode, BlockMass>, 4>;
  225.     using NodeList = SmallVector<BlockNode, 4>;
  226.     using HeaderMassList = SmallVector<BlockMass, 1>;
  227.  
  228.     LoopData *Parent;            ///< The parent loop.
  229.     bool IsPackaged = false;     ///< Whether this has been packaged.
  230.     uint32_t NumHeaders = 1;     ///< Number of headers.
  231.     ExitMap Exits;               ///< Successor edges (and weights).
  232.     NodeList Nodes;              ///< Header and the members of the loop.
  233.     HeaderMassList BackedgeMass; ///< Mass returned to each loop header.
  234.     BlockMass Mass;
  235.     Scaled64 Scale;
  236.  
  237.     LoopData(LoopData *Parent, const BlockNode &Header)
  238.       : Parent(Parent), Nodes(1, Header), BackedgeMass(1) {}
  239.  
  240.     template <class It1, class It2>
  241.     LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther,
  242.              It2 LastOther)
  243.         : Parent(Parent), Nodes(FirstHeader, LastHeader) {
  244.       NumHeaders = Nodes.size();
  245.       Nodes.insert(Nodes.end(), FirstOther, LastOther);
  246.       BackedgeMass.resize(NumHeaders);
  247.     }
  248.  
  249.     bool isHeader(const BlockNode &Node) const {
  250.       if (isIrreducible())
  251.         return std::binary_search(Nodes.begin(), Nodes.begin() + NumHeaders,
  252.                                   Node);
  253.       return Node == Nodes[0];
  254.     }
  255.  
  256.     BlockNode getHeader() const { return Nodes[0]; }
  257.     bool isIrreducible() const { return NumHeaders > 1; }
  258.  
  259.     HeaderMassList::difference_type getHeaderIndex(const BlockNode &B) {
  260.       assert(isHeader(B) && "this is only valid on loop header blocks");
  261.       if (isIrreducible())
  262.         return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) -
  263.                Nodes.begin();
  264.       return 0;
  265.     }
  266.  
  267.     NodeList::const_iterator members_begin() const {
  268.       return Nodes.begin() + NumHeaders;
  269.     }
  270.  
  271.     NodeList::const_iterator members_end() const { return Nodes.end(); }
  272.     iterator_range<NodeList::const_iterator> members() const {
  273.       return make_range(members_begin(), members_end());
  274.     }
  275.   };
  276.  
  277.   /// Index of loop information.
  278.   struct WorkingData {
  279.     BlockNode Node;           ///< This node.
  280.     LoopData *Loop = nullptr; ///< The loop this block is inside.
  281.     BlockMass Mass;           ///< Mass distribution from the entry block.
  282.  
  283.     WorkingData(const BlockNode &Node) : Node(Node) {}
  284.  
  285.     bool isLoopHeader() const { return Loop && Loop->isHeader(Node); }
  286.  
  287.     bool isDoubleLoopHeader() const {
  288.       return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() &&
  289.              Loop->Parent->isHeader(Node);
  290.     }
  291.  
  292.     LoopData *getContainingLoop() const {
  293.       if (!isLoopHeader())
  294.         return Loop;
  295.       if (!isDoubleLoopHeader())
  296.         return Loop->Parent;
  297.       return Loop->Parent->Parent;
  298.     }
  299.  
  300.     /// Resolve a node to its representative.
  301.     ///
  302.     /// Get the node currently representing Node, which could be a containing
  303.     /// loop.
  304.     ///
  305.     /// This function should only be called when distributing mass.  As long as
  306.     /// there are no irreducible edges to Node, then it will have complexity
  307.     /// O(1) in this context.
  308.     ///
  309.     /// In general, the complexity is O(L), where L is the number of loop
  310.     /// headers Node has been packaged into.  Since this method is called in
  311.     /// the context of distributing mass, L will be the number of loop headers
  312.     /// an early exit edge jumps out of.
  313.     BlockNode getResolvedNode() const {
  314.       auto L = getPackagedLoop();
  315.       return L ? L->getHeader() : Node;
  316.     }
  317.  
  318.     LoopData *getPackagedLoop() const {
  319.       if (!Loop || !Loop->IsPackaged)
  320.         return nullptr;
  321.       auto L = Loop;
  322.       while (L->Parent && L->Parent->IsPackaged)
  323.         L = L->Parent;
  324.       return L;
  325.     }
  326.  
  327.     /// Get the appropriate mass for a node.
  328.     ///
  329.     /// Get appropriate mass for Node.  If Node is a loop-header (whose loop
  330.     /// has been packaged), returns the mass of its pseudo-node.  If it's a
  331.     /// node inside a packaged loop, it returns the loop's mass.
  332.     BlockMass &getMass() {
  333.       if (!isAPackage())
  334.         return Mass;
  335.       if (!isADoublePackage())
  336.         return Loop->Mass;
  337.       return Loop->Parent->Mass;
  338.     }
  339.  
  340.     /// Has ContainingLoop been packaged up?
  341.     bool isPackaged() const { return getResolvedNode() != Node; }
  342.  
  343.     /// Has Loop been packaged up?
  344.     bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; }
  345.  
  346.     /// Has Loop been packaged up twice?
  347.     bool isADoublePackage() const {
  348.       return isDoubleLoopHeader() && Loop->Parent->IsPackaged;
  349.     }
  350.   };
  351.  
  352.   /// Unscaled probability weight.
  353.   ///
  354.   /// Probability weight for an edge in the graph (including the
  355.   /// successor/target node).
  356.   ///
  357.   /// All edges in the original function are 32-bit.  However, exit edges from
  358.   /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
  359.   /// space in general.
  360.   ///
  361.   /// In addition to the raw weight amount, Weight stores the type of the edge
  362.   /// in the current context (i.e., the context of the loop being processed).
  363.   /// Is this a local edge within the loop, an exit from the loop, or a
  364.   /// backedge to the loop header?
  365.   struct Weight {
  366.     enum DistType { Local, Exit, Backedge };
  367.     DistType Type = Local;
  368.     BlockNode TargetNode;
  369.     uint64_t Amount = 0;
  370.  
  371.     Weight() = default;
  372.     Weight(DistType Type, BlockNode TargetNode, uint64_t Amount)
  373.         : Type(Type), TargetNode(TargetNode), Amount(Amount) {}
  374.   };
  375.  
  376.   /// Distribution of unscaled probability weight.
  377.   ///
  378.   /// Distribution of unscaled probability weight to a set of successors.
  379.   ///
  380.   /// This class collates the successor edge weights for later processing.
  381.   ///
  382.   /// \a DidOverflow indicates whether \a Total did overflow while adding to
  383.   /// the distribution.  It should never overflow twice.
  384.   struct Distribution {
  385.     using WeightList = SmallVector<Weight, 4>;
  386.  
  387.     WeightList Weights;       ///< Individual successor weights.
  388.     uint64_t Total = 0;       ///< Sum of all weights.
  389.     bool DidOverflow = false; ///< Whether \a Total did overflow.
  390.  
  391.     Distribution() = default;
  392.  
  393.     void addLocal(const BlockNode &Node, uint64_t Amount) {
  394.       add(Node, Amount, Weight::Local);
  395.     }
  396.  
  397.     void addExit(const BlockNode &Node, uint64_t Amount) {
  398.       add(Node, Amount, Weight::Exit);
  399.     }
  400.  
  401.     void addBackedge(const BlockNode &Node, uint64_t Amount) {
  402.       add(Node, Amount, Weight::Backedge);
  403.     }
  404.  
  405.     /// Normalize the distribution.
  406.     ///
  407.     /// Combines multiple edges to the same \a Weight::TargetNode and scales
  408.     /// down so that \a Total fits into 32-bits.
  409.     ///
  410.     /// This is linear in the size of \a Weights.  For the vast majority of
  411.     /// cases, adjacent edge weights are combined by sorting WeightList and
  412.     /// combining adjacent weights.  However, for very large edge lists an
  413.     /// auxiliary hash table is used.
  414.     void normalize();
  415.  
  416.   private:
  417.     void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
  418.   };
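  // Conceptual sketch of normalize() above (illustrative comment only; the
  // actual implementation lives outside this header):
  //   1. Combine entries in Weights that share a TargetNode, summing their
  //      Amounts (by sorting for typical lists, or via a hash table for very
  //      large ones, as described above).
  //   2. If Total does not fit in 32 bits (or DidOverflow is set), shift every
  //      Amount right by the same number of bits so that the new Total does.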
  419.  
  420.   /// Data about each block.  This is used downstream.
  421.   std::vector<FrequencyData> Freqs;
  422.  
  423.   /// Whether each block is an irreducible loop header.
  424.   /// This is used downstream.
  425.   SparseBitVector<> IsIrrLoopHeader;
  426.  
  427.   /// Loop data: see initializeLoops().
  428.   std::vector<WorkingData> Working;
  429.  
  430.   /// Indexed information about loops.
  431.   std::list<LoopData> Loops;
  432.  
  433.   /// Virtual destructor.
  434.   ///
  435.   /// Need a virtual destructor to mask the compiler warning about
  436.   /// getBlockName().
  437.   virtual ~BlockFrequencyInfoImplBase() = default;
  438.  
  439.   /// Add all edges out of a packaged loop to the distribution.
  440.   ///
  441.   /// Adds all edges from LocalLoopHead to Dist.  Calls addToDist() to add each
  442.   /// successor edge.
  443.   ///
  444.   /// \return \c true unless there's an irreducible backedge.
  445.   bool addLoopSuccessorsToDist(const LoopData *OuterLoop, LoopData &Loop,
  446.                                Distribution &Dist);
  447.  
  448.   /// Add an edge to the distribution.
  449.   ///
  450.   /// Adds an edge to Succ to Dist.  If \c LoopHead.isValid(), then whether the
  451.   /// edge is local/exit/backedge is in the context of LoopHead.  Otherwise,
  452.   /// every edge should be a local edge (since all the loops are packaged up).
  453.   ///
  454.   /// \return \c true unless aborted due to an irreducible backedge.
  455.   bool addToDist(Distribution &Dist, const LoopData *OuterLoop,
  456.                  const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
  457.  
  458.   /// Analyze irreducible SCCs.
  459.   ///
  460.   /// Separate irreducible SCCs from \c G, which is an explicit graph of \c
  461.   /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr).
  462.   /// Insert them into \a Loops before \c Insert.
  463.   ///
  464.   /// \return the \c LoopData nodes representing the irreducible SCCs.
  465.   iterator_range<std::list<LoopData>::iterator>
  466.   analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop,
  467.                      std::list<LoopData>::iterator Insert);
  468.  
  469.   /// Update a loop after packaging irreducible SCCs inside of it.
  470.   ///
  471.   /// Update \c OuterLoop.  Before finding irreducible control flow, it was
  472.   /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a
  473.   /// LoopData::BackedgeMass need to be reset.  Also, nodes that were packaged
  474.   /// up need to be removed from \a OuterLoop::Nodes.
  475.   void updateLoopWithIrreducible(LoopData &OuterLoop);
  476.  
  477.   /// Distribute mass according to a distribution.
  478.   ///
  479.   /// Distributes the mass in Source according to Dist.  If LoopHead.isValid(),
  480.   /// backedges and exits are stored in its entry in Loops.
  481.   ///
  482.   /// Mass is distributed in parallel from two copies of the source mass.
  483.   void distributeMass(const BlockNode &Source, LoopData *OuterLoop,
  484.                       Distribution &Dist);
  485.  
  486.   /// Compute the loop scale for a loop.
  487.   void computeLoopScale(LoopData &Loop);
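  // Worked illustration (numbers assumed purely for the example, and assuming
  // the scale is computed as the inverse of the mass that exits the loop): if
  // the backedge mass accumulated at the loop's header totals 3/4 of full
  // mass, the exit mass is 1/4 and the loop scale is 4.0, i.e. the loop body
  // is expected to execute about four times per entry into the loop.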
  488.  
  489.   /// Adjust the mass of all headers in an irreducible loop.
  490.   ///
  491.   /// Initially, irreducible loops are assumed to distribute their mass
  492.   /// equally among their headers. This can lead to wrong frequency estimates
  493.   /// since some headers may be executed more frequently than others.
  494.   ///
  495.   /// This adjusts header mass distribution so it matches the weights of
  496.   /// the backedges going into each of the loop headers.
  497.   void adjustLoopHeaderMass(LoopData &Loop);
  498.  
  499.   void distributeIrrLoopHeaderMass(Distribution &Dist);
  500.  
  501.   /// Package up a loop.
  502.   void packageLoop(LoopData &Loop);
  503.  
  504.   /// Unwrap loops.
  505.   void unwrapLoops();
  506.  
  507.   /// Finalize frequency metrics.
  508.   ///
  509.   /// Calculates final frequencies and cleans up no-longer-needed data
  510.   /// structures.
  511.   void finalizeMetrics();
  512.  
  513.   /// Clear all memory.
  514.   void clear();
  515.  
  516.   virtual std::string getBlockName(const BlockNode &Node) const;
  517.   std::string getLoopName(const LoopData &Loop) const;
  518.  
  519.   virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
  520.   void dump() const { print(dbgs()); }
  521.  
  522.   Scaled64 getFloatingBlockFreq(const BlockNode &Node) const;
  523.  
  524.   BlockFrequency getBlockFreq(const BlockNode &Node) const;
  525.   std::optional<uint64_t>
  526.   getBlockProfileCount(const Function &F, const BlockNode &Node,
  527.                        bool AllowSynthetic = false) const;
  528.   std::optional<uint64_t>
  529.   getProfileCountFromFreq(const Function &F, uint64_t Freq,
  530.                           bool AllowSynthetic = false) const;
  531.   bool isIrrLoopHeader(const BlockNode &Node);
  532.  
  533.   void setBlockFreq(const BlockNode &Node, uint64_t Freq);
  534.  
  535.   raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
  536.   raw_ostream &printBlockFreq(raw_ostream &OS,
  537.                               const BlockFrequency &Freq) const;
  538.  
  539.   uint64_t getEntryFreq() const {
  540.     assert(!Freqs.empty());
  541.     return Freqs[0].Integer;
  542.   }
  543. };
  544.  
  545. namespace bfi_detail {
  546.  
  547. template <class BlockT> struct TypeMap {};
  548. template <> struct TypeMap<BasicBlock> {
  549.   using BlockT = BasicBlock;
  550.   using BlockKeyT = AssertingVH<const BasicBlock>;
  551.   using FunctionT = Function;
  552.   using BranchProbabilityInfoT = BranchProbabilityInfo;
  553.   using LoopT = Loop;
  554.   using LoopInfoT = LoopInfo;
  555. };
  556. template <> struct TypeMap<MachineBasicBlock> {
  557.   using BlockT = MachineBasicBlock;
  558.   using BlockKeyT = const MachineBasicBlock *;
  559.   using FunctionT = MachineFunction;
  560.   using BranchProbabilityInfoT = MachineBranchProbabilityInfo;
  561.   using LoopT = MachineLoop;
  562.   using LoopInfoT = MachineLoopInfo;
  563. };
  564.  
  565. template <class BlockT, class BFIImplT>
  566. class BFICallbackVH;
  567.  
  568. /// Get the name of a MachineBasicBlock.
  569. ///
  570. /// Get the name of a MachineBasicBlock.  It's templated so that including from
  571. /// CodeGen is unnecessary (that would be a layering issue).
  572. ///
  573. /// This is used mainly for debug output.  The name is similar to
  574. /// MachineBasicBlock::getFullName(), but skips the name of the function.
  575. template <class BlockT> std::string getBlockName(const BlockT *BB) {
  576.   assert(BB && "Unexpected nullptr");
  577.   auto MachineName = "BB" + Twine(BB->getNumber());
  578.   if (BB->getBasicBlock())
  579.     return (MachineName + "[" + BB->getName() + "]").str();
  580.   return MachineName.str();
  581. }
  582. /// Get the name of a BasicBlock.
  583. template <> inline std::string getBlockName(const BasicBlock *BB) {
  584.   assert(BB && "Unexpected nullptr");
  585.   return BB->getName().str();
  586. }
  587.  
  588. /// Graph of irreducible control flow.
  589. ///
  590. /// This graph is used for determining the SCCs in a loop (or top-level
  591. /// function) that has irreducible control flow.
  592. ///
  593. /// During the block frequency algorithm, the local graphs are defined in a
  594. /// light-weight way, deferring to the \a BasicBlock or \a MachineBasicBlock
  595. /// graphs for most edges, but getting others from \a LoopData::ExitMap.  The
  596. /// latter only has successor information.
  597. ///
  598. /// \a IrreducibleGraph makes this graph explicit.  It's in a form that can use
  599. /// \a GraphTraits (so that \a analyzeIrreducible() can use \a scc_iterator),
  600. /// and it explicitly lists predecessors and successors.  The initialization
  601. /// that relies on \c MachineBasicBlock is defined in the header.
  602. struct IrreducibleGraph {
  603.   using BFIBase = BlockFrequencyInfoImplBase;
  604.  
  605.   BFIBase &BFI;
  606.  
  607.   using BlockNode = BFIBase::BlockNode;
  608.   struct IrrNode {
  609.     BlockNode Node;
  610.     unsigned NumIn = 0;
  611.     std::deque<const IrrNode *> Edges;
  612.  
  613.     IrrNode(const BlockNode &Node) : Node(Node) {}
  614.  
  615.     using iterator = std::deque<const IrrNode *>::const_iterator;
  616.  
  617.     iterator pred_begin() const { return Edges.begin(); }
  618.     iterator succ_begin() const { return Edges.begin() + NumIn; }
  619.     iterator pred_end() const { return succ_begin(); }
  620.     iterator succ_end() const { return Edges.end(); }
  621.   };
  622.   BlockNode Start;
  623.   const IrrNode *StartIrr = nullptr;
  624.   std::vector<IrrNode> Nodes;
  625.   SmallDenseMap<uint32_t, IrrNode *, 4> Lookup;
  626.  
  627.   /// Construct an explicit graph containing irreducible control flow.
  628.   ///
  629.   /// Construct an explicit graph of the control flow in \c OuterLoop (or the
  630.   /// top-level function, if \c OuterLoop is \c nullptr).  Uses \c
  631.   /// addBlockEdges to add block successors that have not been packaged into
  632.   /// loops.
  633.   ///
  634.   /// \a BlockFrequencyInfoImpl::computeIrreducibleMass() is the only expected
  635.   /// user of this.
  636.   template <class BlockEdgesAdder>
  637.   IrreducibleGraph(BFIBase &BFI, const BFIBase::LoopData *OuterLoop,
  638.                    BlockEdgesAdder addBlockEdges) : BFI(BFI) {
  639.     initialize(OuterLoop, addBlockEdges);
  640.   }
  641.  
  642.   template <class BlockEdgesAdder>
  643.   void initialize(const BFIBase::LoopData *OuterLoop,
  644.                   BlockEdgesAdder addBlockEdges);
  645.   void addNodesInLoop(const BFIBase::LoopData &OuterLoop);
  646.   void addNodesInFunction();
  647.  
  648.   void addNode(const BlockNode &Node) {
  649.     Nodes.emplace_back(Node);
  650.     BFI.Working[Node.Index].getMass() = BlockMass::getEmpty();
  651.   }
  652.  
  653.   void indexNodes();
  654.   template <class BlockEdgesAdder>
  655.   void addEdges(const BlockNode &Node, const BFIBase::LoopData *OuterLoop,
  656.                 BlockEdgesAdder addBlockEdges);
  657.   void addEdge(IrrNode &Irr, const BlockNode &Succ,
  658.                const BFIBase::LoopData *OuterLoop);
  659. };
  660.  
  661. template <class BlockEdgesAdder>
  662. void IrreducibleGraph::initialize(const BFIBase::LoopData *OuterLoop,
  663.                                   BlockEdgesAdder addBlockEdges) {
  664.   if (OuterLoop) {
  665.     addNodesInLoop(*OuterLoop);
  666.     for (auto N : OuterLoop->Nodes)
  667.       addEdges(N, OuterLoop, addBlockEdges);
  668.   } else {
  669.     addNodesInFunction();
  670.     for (uint32_t Index = 0; Index < BFI.Working.size(); ++Index)
  671.       addEdges(Index, OuterLoop, addBlockEdges);
  672.   }
  673.   StartIrr = Lookup[Start.Index];
  674. }
  675.  
  676. template <class BlockEdgesAdder>
  677. void IrreducibleGraph::addEdges(const BlockNode &Node,
  678.                                 const BFIBase::LoopData *OuterLoop,
  679.                                 BlockEdgesAdder addBlockEdges) {
  680.   auto L = Lookup.find(Node.Index);
  681.   if (L == Lookup.end())
  682.     return;
  683.   IrrNode &Irr = *L->second;
  684.   const auto &Working = BFI.Working[Node.Index];
  685.  
  686.   if (Working.isAPackage())
  687.     for (const auto &I : Working.Loop->Exits)
  688.       addEdge(Irr, I.first, OuterLoop);
  689.   else
  690.     addBlockEdges(*this, Irr, OuterLoop);
  691. }
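
// Illustrative note (the shape is inferred from the call in addEdges above,
// not from a definition in this excerpt): an addBlockEdges callback is
// expected to be callable as
//   void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
//                   const BlockFrequencyInfoImplBase::LoopData *OuterLoop);
// and to call G.addEdge(Irr, SuccNode, OuterLoop) for each successor of the
// block behind Irr.Node that has not been packaged into a loop.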
  692.  
  693. } // end namespace bfi_detail
  694.  
  695. /// Shared implementation for block frequency analysis.
  696. ///
  697. /// This is a shared implementation of BlockFrequencyInfo and
  698. /// MachineBlockFrequencyInfo, and calculates the relative frequencies of
  699. /// blocks.
  700. ///
  701. /// LoopInfo defines a loop as a "non-trivial" SCC dominated by a single block,
  702. /// which is called the header.  A given loop, L, can have sub-loops, which are
  703. /// loops within the subgraph of L that exclude its header.  (A "trivial" SCC
  704. /// consists of a single block that does not have a self-edge.)
  705. ///
  706. /// In addition to loops, this algorithm has limited support for irreducible
  707. /// SCCs, which are SCCs with multiple entry blocks.  Irreducible SCCs are
  708. /// discovered on the fly, and modelled as loops with multiple headers.
  709. ///
  710. /// The headers of an irreducible sub-SCC consist of its entry blocks and all
  711. /// nodes that are targets of a backedge within it (excluding backedges within
  712. /// true sub-loops).  Block frequency calculations act as if a block is
  713. /// inserted that intercepts all the edges to the headers.  All backedges and
  714. /// entries point to this block.  Its successors are the headers, which split
  715. /// the frequency evenly.
  716. ///
  717. /// This algorithm leverages BlockMass and ScaledNumber to maintain precision,
  718. /// separates mass distribution from loop scaling, and dithers to eliminate
  719. /// probability mass loss.
  720. ///
  721. /// The implementation is split between BlockFrequencyInfoImpl, which knows the
  722. /// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
  723. /// BlockFrequencyInfoImplBase, which doesn't.  The base class uses \a
  724. /// BlockNode, a wrapper around a uint32_t.  BlockNode is numbered from 0 in
  725. /// reverse-post order.  This gives two advantages:  it's easy to compare the
  726. /// relative ordering of two nodes, and maps keyed on BlockT can be represented
  727. /// by vectors.
  728. ///
  729. /// This algorithm is O(V+E), unless there is irreducible control flow, in
  730. /// which case it's O(V*E) in the worst case.
  731. ///
  732. /// These are the main stages:
  733. ///
  734. ///  0. Reverse post-order traversal (\a initializeRPOT()).
  735. ///
  736. ///     Run a single post-order traversal and save it (in reverse) in RPOT.
  737. ///     All other stages make use of this ordering.  Save a lookup from BlockT
  738. ///     to BlockNode (the index into RPOT) in Nodes.
  739. ///
  740. ///  1. Loop initialization (\a initializeLoops()).
  741. ///
  742. ///     Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
  743. ///     the algorithm.  In particular, store the immediate members of each loop
  744. ///     in reverse post-order.
  745. ///
  746. ///  2. Calculate mass and scale in loops (\a computeMassInLoops()).
  747. ///
  748. ///     For each loop (bottom-up), distribute mass through the DAG resulting
  749. ///     from ignoring backedges and treating sub-loops as a single pseudo-node.
  750. ///     Track the backedge mass distributed to the loop header, and use it to
  751. ///     calculate the loop scale (number of loop iterations).  Immediate
  752. ///     members that represent sub-loops will already have been visited and
  753. ///     packaged into a pseudo-node.
  754. ///
  755. ///     Distributing mass in a loop is a reverse-post-order traversal through
  756. ///     the loop.  Start by assigning full mass to the Loop header.  For each
  757. ///     node in the loop:
  758. ///
  759. ///         - Fetch and categorize the weight distribution for its successors.
  760. ///           If this is a packaged-subloop, the weight distribution is stored
  761. ///           in \a LoopData::Exits.  Otherwise, fetch it from
  762. ///           BranchProbabilityInfo.
  763. ///
  764. ///         - Each successor is categorized as \a Weight::Local, a local edge
  765. ///           within the current loop, \a Weight::Backedge, a backedge to the
  766. ///           loop header, or \a Weight::Exit, any successor outside the loop.
  767. ///           The weight, the successor, and its category are stored in \a
  768. ///           Distribution.  There can be multiple edges to each successor.
  769. ///
  770. ///         - If there's a backedge to a non-header, there's an irreducible SCC.
  771. ///           The usual flow is temporarily aborted.  \a
  772. ///           computeIrreducibleMass() finds the irreducible SCCs within the
  773. ///           loop, packages them up, and restarts the flow.
  774. ///
  775. ///         - Normalize the distribution:  scale weights down so that their sum
  776. ///           fits into 32 bits, and coalesce multiple edges to the same node.
  777. ///
  778. ///         - Distribute the mass accordingly, dithering to minimize mass loss,
  779. ///           as described in \a distributeMass().
  780. ///
  781. ///     In the case of irreducible loops, instead of a single loop header,
  782. ///     there will be several. The computation of backedge masses is similar
  783. ///     but instead of having a single backedge mass, there will be one
  784. ///     backedge per loop header. In these cases, each backedge will carry
  785. ///     a mass proportional to the edge weights along the corresponding
  786. ///     path.
  787. ///
  788. ///     At the end of propagation, the full mass assigned to the loop will be
  789. ///     distributed among the loop headers proportionally according to the
  790. ///     mass flowing through their backedges.
  791. ///
  792. ///     Finally, calculate the loop scale from the accumulated backedge mass.
  793. ///
  794. ///  3. Distribute mass in the function (\a computeMassInFunction()).
  795. ///
  796. ///     Finally, distribute mass through the DAG resulting from packaging all
  797. ///     loops in the function.  This uses the same algorithm as distributing
  798. ///     mass in a loop, except that there are no exit or backedge edges.
  799. ///
  800. ///  4. Unpackage loops (\a unwrapLoops()).
  801. ///
  802. ///     Initialize each block's frequency to a floating point representation of
  803. ///     its mass.
  804. ///
  805. ///     Visit loops top-down, scaling the frequencies of each loop's immediate
  806. ///     members by its pseudo-node's frequency.
  807. ///
  808. ///  5. Convert frequencies to a 64-bit range (\a finalizeMetrics()).
  809. ///
  810. ///     Using the min and max frequencies as a guide, translate floating point
  811. ///     frequencies to an appropriate range in uint64_t.
  812. ///
  813. /// It has some known flaws.
  814. ///
  815. ///   - The model of irreducible control flow is a rough approximation.
  816. ///
  817. ///     Modelling irreducible control flow exactly involves setting up and
  818. ///     solving a group of infinite geometric series.  Such precision is
  819. ///     unlikely to be worthwhile, since most of our algorithms give up on
  820. ///     irreducible control flow anyway.
  821. ///
  822. ///     Nevertheless, we might find that we need to get closer.  Here's a sort
  823. ///     of TODO list for the model with diminishing returns, to be completed as
  824. ///     necessary.
  825. ///
  826. ///       - The headers for the \a LoopData representing an irreducible SCC
  827. ///         include non-entry blocks.  When these extra blocks exist, they
  828. ///         indicate a self-contained irreducible sub-SCC.  We could treat them
  829. ///         as sub-loops, rather than arbitrarily shoving the problematic
  830. ///         blocks into the headers of the main irreducible SCC.
  831. ///
  832. ///       - Entry frequencies are assumed to be evenly split between the
  833. ///         headers of a given irreducible SCC, which is the only option if we
  834. ///         need to compute mass in the SCC before its parent loop.  Instead,
  835. ///         we could partially compute mass in the parent loop, and stop when
  836. ///         we get to the SCC.  Here, we have the correct ratio of entry
  837. ///         masses, which we can use to adjust their relative frequencies.
  838. ///         Compute mass in the SCC, and then continue propagation in the
  839. ///         parent.
  840. ///
  841. ///       - We can propagate mass iteratively through the SCC, for some fixed
  842. ///         number of iterations.  Each iteration starts by assigning the entry
  843. ///         blocks their backedge mass from the prior iteration.  The final
  844. ///         mass for each block (and each exit, and the total backedge mass
  845. ///         used for computing loop scale) is the sum of all iterations.
  846. ///         (Running this until fixed point would "solve" the geometric
  847. ///         series by simulation.)
  848. template <class BT> class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
  849.   // This is part of a workaround for a GCC 4.7 crash on lambdas.
  850.   friend struct bfi_detail::BlockEdgesAdder<BT>;
  851.  
  852.   using BlockT = typename bfi_detail::TypeMap<BT>::BlockT;
  853.   using BlockKeyT = typename bfi_detail::TypeMap<BT>::BlockKeyT;
  854.   using FunctionT = typename bfi_detail::TypeMap<BT>::FunctionT;
  855.   using BranchProbabilityInfoT =
  856.       typename bfi_detail::TypeMap<BT>::BranchProbabilityInfoT;
  857.   using LoopT = typename bfi_detail::TypeMap<BT>::LoopT;
  858.   using LoopInfoT = typename bfi_detail::TypeMap<BT>::LoopInfoT;
  859.   using Successor = GraphTraits<const BlockT *>;
  860.   using Predecessor = GraphTraits<Inverse<const BlockT *>>;
  861.   using BFICallbackVH =
  862.       bfi_detail::BFICallbackVH<BlockT, BlockFrequencyInfoImpl>;
  863.  
  864.   const BranchProbabilityInfoT *BPI = nullptr;
  865.   const LoopInfoT *LI = nullptr;
  866.   const FunctionT *F = nullptr;
  867.  
  868.   // All blocks in reverse postorder.
  869.   std::vector<const BlockT *> RPOT;
  870.   DenseMap<BlockKeyT, std::pair<BlockNode, BFICallbackVH>> Nodes;
  871.  
  872.   using rpot_iterator = typename std::vector<const BlockT *>::const_iterator;
  873.  
  874.   rpot_iterator rpot_begin() const { return RPOT.begin(); }
  875.   rpot_iterator rpot_end() const { return RPOT.end(); }
  876.  
  877.   size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
  878.  
  879.   BlockNode getNode(const rpot_iterator &I) const {
  880.     return BlockNode(getIndex(I));
  881.   }
  882.  
  883.   BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB).first; }
  884.  
  885.   const BlockT *getBlock(const BlockNode &Node) const {
  886.     assert(Node.Index < RPOT.size());
  887.     return RPOT[Node.Index];
  888.   }
  889.  
  890.   /// Run (and save) a post-order traversal.
  891.   ///
  892.   /// Saves a reverse post-order traversal of all the nodes in \a F.
  893.   void initializeRPOT();
  894.  
  895.   /// Initialize loop data.
  896.   ///
  897.   /// Build up \a Loops using \a LoopInfo.  \a LoopInfo gives us a mapping from
  898.   /// each block to the deepest loop it's in, but we need the inverse.  For each
  899.   /// loop, we store in reverse post-order its "immediate" members, defined as
  900.   /// the header, the headers of immediate sub-loops, and all other blocks in
  901.   /// the loop that are not in sub-loops.
  902.   void initializeLoops();
  903.  
  904.   /// Propagate to a block's successors.
  905.   ///
  906.   /// In the context of distributing mass through \c OuterLoop, divide the mass
  907.   /// currently assigned to \c Node between its successors.
  908.   ///
  909.   /// \return \c true unless there's an irreducible backedge.
  910.   bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node);
  911.  
  912.   /// Compute mass in a particular loop.
  913.   ///
  914.   /// Assign mass to \c Loop's header, and then for each block in \c Loop in
  915.   /// reverse post-order, distribute mass to its successors.  Only visits nodes
  916.   /// that have not been packaged into sub-loops.
  917.   ///
  918.   /// \pre \a computeMassInLoop() has been called for each subloop of \c Loop.
  919.   /// \return \c true unless there's an irreducible backedge.
  920.   bool computeMassInLoop(LoopData &Loop);
  921.  
  922.   /// Try to compute mass in the top-level function.
  923.   ///
  924.   /// Assign mass to the entry block, and then for each block in reverse
  925.   /// post-order, distribute mass to its successors.  Skips nodes that have
  926.   /// been packaged into loops.
  927.   ///
  928.   /// \pre \a computeMassInLoops() has been called.
  929.   /// \return \c true unless there's an irreducible backedge.
  930.   bool tryToComputeMassInFunction();
  931.  
  932.   /// Compute mass in (and package up) irreducible SCCs.
  933.   ///
  934.   /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front
  935.   /// of \c Insert), and call \a computeMassInLoop() on each of them.
  936.   ///
  937.   /// If \c OuterLoop is \c nullptr, it refers to the top-level function.
  938.   ///
  939.   /// \pre \a computeMassInLoop() has been called for each subloop of \c
  940.   /// OuterLoop.
  941.   /// \pre \c Insert points at the last loop successfully processed by \a
  942.   /// computeMassInLoop().
  943.   /// \pre \c OuterLoop has irreducible SCCs.
  944.   void computeIrreducibleMass(LoopData *OuterLoop,
  945.                               std::list<LoopData>::iterator Insert);
  946.  
  947.   /// Compute mass in all loops.
  948.   ///
  949.   /// For each loop bottom-up, call \a computeMassInLoop().
  950.   ///
  951.   /// \a computeMassInLoop() aborts (and returns \c false) on loops that
  952.   /// contain irreducible sub-SCCs.  Use \a computeIrreducibleMass() and then
  953.   /// re-enter \a computeMassInLoop().
  954.   ///
  955.   /// \post \a computeMassInLoop() has returned \c true for every loop.
  956.   void computeMassInLoops();
  957.  
  958.   /// Compute mass in the top-level function.
  959.   ///
  960.   /// Uses \a tryToComputeMassInFunction() and \a computeIrreducibleMass() to
  961.   /// compute mass in the top-level function.
  962.   ///
  963.   /// \post \a tryToComputeMassInFunction() has returned \c true.
  964.   void computeMassInFunction();
  965.  
  966.   std::string getBlockName(const BlockNode &Node) const override {
  967.     return bfi_detail::getBlockName(getBlock(Node));
  968.   }
  969.  
  970.   /// The current implementation for computing relative block frequencies does
  971.   /// not handle correctly control-flow graphs containing irreducible loops. To
  972.   /// resolve the problem, we apply a post-processing step, which iteratively
  973.   /// updates block frequencies based on the frequencies of their predecessors.
  974.   /// This corresponds to finding the stationary point of the Markov chain by
  975.   /// an iterative method aka "PageRank computation".
  976.   /// The algorithm takes at most O(|E| * IterativeBFIMaxIterationsPerBlock)
  977.   /// steps but typically converges faster.
  978.   ///
  979.   /// Decide whether we want to apply iterative inference for a given function.
  980.   bool needIterativeInference() const;
  981.  
  982.   /// Apply an iterative post-processing to infer correct counts for irr loops.
  983.   void applyIterativeInference();
  984.  
  985.   using ProbMatrixType = std::vector<std::vector<std::pair<size_t, Scaled64>>>;
  986.  
  987.   /// Run iterative inference for a probability matrix and initial frequencies.
  988.   void iterativeInference(const ProbMatrixType &ProbMatrix,
  989.                           std::vector<Scaled64> &Freq) const;
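  // Conceptual update performed by one inference pass (illustrative comment
  // only; the actual solver also applies convergence checks driven by
  // IterativeBFIPrecision and IterativeBFIMaxIterationsPerBlock):
  //   for each block I:
  //     NewFreq[I] = sum over (J, P) in ProbMatrix[I] of Freq[J] * P,
  //   where P = Pr[J -> I | J] as described for initTransitionProbabilities
  //   below.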
  990.  
  991.   /// Find all blocks to apply inference on, that is, reachable from the entry
  992.   /// and backward reachable from exits along edges with positive probability.
  993.   void findReachableBlocks(std::vector<const BlockT *> &Blocks) const;
  994.  
  995.   /// Build a matrix of probabilities with transitions (edges) between the
  996.   /// blocks: ProbMatrix[I] holds pairs (J, P), where Pr[J -> I | J] = P
  997.   void initTransitionProbabilities(
  998.       const std::vector<const BlockT *> &Blocks,
  999.       const DenseMap<const BlockT *, size_t> &BlockIndex,
  1000.       ProbMatrixType &ProbMatrix) const;
  1001.  
  1002. #ifndef NDEBUG
  1003.   /// Compute the discrepancy between current block frequencies and the
  1004.   /// probability matrix.
  1005.   Scaled64 discrepancy(const ProbMatrixType &ProbMatrix,
  1006.                        const std::vector<Scaled64> &Freq) const;
  1007. #endif
  1008.  
  1009. public:
  1010.   BlockFrequencyInfoImpl() = default;
  1011.  
  1012.   const FunctionT *getFunction() const { return F; }
  1013.  
  1014.   void calculate(const FunctionT &F, const BranchProbabilityInfoT &BPI,
  1015.                  const LoopInfoT &LI);
  1016.  
  1017.   using BlockFrequencyInfoImplBase::getEntryFreq;
  1018.  
  1019.   BlockFrequency getBlockFreq(const BlockT *BB) const {
  1020.     return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
  1021.   }
  1022.  
  1023.   std::optional<uint64_t>
  1024.   getBlockProfileCount(const Function &F, const BlockT *BB,
  1025.                        bool AllowSynthetic = false) const {
  1026.     return BlockFrequencyInfoImplBase::getBlockProfileCount(F, getNode(BB),
  1027.                                                             AllowSynthetic);
  1028.   }
  1029.  
  1030.   std::optional<uint64_t>
  1031.   getProfileCountFromFreq(const Function &F, uint64_t Freq,
  1032.                           bool AllowSynthetic = false) const {
  1033.     return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq,
  1034.                                                                AllowSynthetic);
  1035.   }
  1036.  
  1037.   bool isIrrLoopHeader(const BlockT *BB) {
  1038.     return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
  1039.   }
  1040.  
  1041.   void setBlockFreq(const BlockT *BB, uint64_t Freq);
  1042.  
  1043.   void forgetBlock(const BlockT *BB) {
  1044.     // We don't erase corresponding items from `Freqs`, `RPOT` and other data
  1045.     // structures, to avoid invalidating indices. Doing so would have saved
  1046.     // some memory, but it's not worth it.
  1047.     Nodes.erase(BB);
  1048.   }
  1049.  
  1050.   Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
  1051.     return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
  1052.   }
  1053.  
  1054.   const BranchProbabilityInfoT &getBPI() const { return *BPI; }
  1055.  
  1056.   /// Print the frequencies for the current function.
  1057.   ///
  1058.   /// Prints the frequencies for the blocks in the current function.
  1059.   ///
  1060.   /// Blocks are printed in the natural iteration order of the function, rather
  1061.   /// than reverse post-order.  This provides two advantages:  writing -analyze
  1062.   /// tests is easier (since blocks come out in source order), and even
  1063.   /// unreachable blocks are printed.
  1064.   ///
  1065.   /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
  1066.   /// we need to override it here.
  1067.   raw_ostream &print(raw_ostream &OS) const override;
  1068.  
  1069.   using BlockFrequencyInfoImplBase::dump;
  1070.   using BlockFrequencyInfoImplBase::printBlockFreq;
  1071.  
  1072.   raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
  1073.     return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
  1074.   }
  1075.  
  1076.   void verifyMatch(BlockFrequencyInfoImpl<BT> &Other) const;
  1077. };
  1078.  
  1079. namespace bfi_detail {
  1080.  
  1081. template <class BFIImplT>
  1082. class BFICallbackVH<BasicBlock, BFIImplT> : public CallbackVH {
  1083.   BFIImplT *BFIImpl;
  1084.  
  1085. public:
  1086.   BFICallbackVH() = default;
  1087.  
  1088.   BFICallbackVH(const BasicBlock *BB, BFIImplT *BFIImpl)
  1089.       : CallbackVH(BB), BFIImpl(BFIImpl) {}
  1090.  
  1091.   virtual ~BFICallbackVH() = default;
  1092.  
  1093.   void deleted() override {
  1094.     BFIImpl->forgetBlock(cast<BasicBlock>(getValPtr()));
  1095.   }
  1096. };
  1097.  
  1098. /// Dummy implementation: MachineBasicBlocks aren't Values, so ValueHandles
  1099. /// don't apply to them.
  1100. template <class BFIImplT>
  1101. class BFICallbackVH<MachineBasicBlock, BFIImplT> {
  1102. public:
  1103.   BFICallbackVH() = default;
  1104.   BFICallbackVH(const MachineBasicBlock *, BFIImplT *) {}
  1105. };
  1106.  
  1107. } // end namespace bfi_detail
  1108.  
  1109. template <class BT>
  1110. void BlockFrequencyInfoImpl<BT>::calculate(const FunctionT &F,
  1111.                                            const BranchProbabilityInfoT &BPI,
  1112.                                            const LoopInfoT &LI) {
  1113.   // Save the parameters.
  1114.   this->BPI = &BPI;
  1115.   this->LI = &LI;
  1116.   this->F = &F;
  1117.  
  1118.   // Clean up left-over data structures.
  1119.   BlockFrequencyInfoImplBase::clear();
  1120.   RPOT.clear();
  1121.   Nodes.clear();
  1122.  
  1123.   // Initialize.
  1124.   LLVM_DEBUG(dbgs() << "\nblock-frequency: " << F.getName()
  1125.                     << "\n================="
  1126.                     << std::string(F.getName().size(), '=') << "\n");
  1127.   initializeRPOT();
  1128.   initializeLoops();
  1129.  
  1130.   // Visit loops in post-order to find the local mass distribution, and then do
  1131.   // the full function.
  1132.   computeMassInLoops();
  1133.   computeMassInFunction();
  1134.   unwrapLoops();
  1135.   // Apply a post-processing step improving computed frequencies for functions
  1136.   // with irreducible loops.
  1137.   if (needIterativeInference())
  1138.     applyIterativeInference();
  1139.   finalizeMetrics();
  1140.  
  1141.   if (CheckBFIUnknownBlockQueries) {
  1142.     // To detect BFI queries for unknown blocks, add entries for unreachable
  1143.     // blocks, if any. This is to distinguish between known/existing unreachable
  1144.     // blocks and unknown blocks.
  1145.     for (const BlockT &BB : F)
  1146.       if (!Nodes.count(&BB))
  1147.         setBlockFreq(&BB, 0);
  1148.   }
  1149. }
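
// Illustrative usage sketch (assumed caller-side code, not part of this
// header): a client such as the IR-level BlockFrequencyInfo wrapper is
// expected to drive the implementation roughly like this, with BPI and LI
// supplied by the surrounding analyses:
//   BlockFrequencyInfoImpl<BasicBlock> BFI;
//   BFI.calculate(F, BPI, LI);
//   uint64_t EntryFreq = BFI.getEntryFreq();
//   BlockFrequency Freq = BFI.getBlockFreq(&BB); // relative to the entry freq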
  1150.  
  1151. template <class BT>
  1152. void BlockFrequencyInfoImpl<BT>::setBlockFreq(const BlockT *BB, uint64_t Freq) {
  1153.   if (Nodes.count(BB))
  1154.     BlockFrequencyInfoImplBase::setBlockFreq(getNode(BB), Freq);
  1155.   else {
  1156.     // If BB is a newly added block after BFI is done, we need to create a new
  1157.     // BlockNode for it and assign it a new index. The index can be determined
  1158.     // by the size of Freqs.
  1159.     BlockNode NewNode(Freqs.size());
  1160.     Nodes[BB] = {NewNode, BFICallbackVH(BB, this)};
  1161.     Freqs.emplace_back();
  1162.     BlockFrequencyInfoImplBase::setBlockFreq(NewNode, Freq);
  1163.   }
  1164. }
  1165.  
  1166. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
  1167.   const BlockT *Entry = &F->front();
  1168.   RPOT.reserve(F->size());
  1169.   std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
  1170.   std::reverse(RPOT.begin(), RPOT.end());
  1171.  
  1172.   assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
  1173.          "More nodes in function than Block Frequency Info supports");
  1174.  
  1175.   LLVM_DEBUG(dbgs() << "reverse-post-order-traversal\n");
  1176.   for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
  1177.     BlockNode Node = getNode(I);
  1178.     LLVM_DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node)
  1179.                       << "\n");
  1180.     Nodes[*I] = {Node, BFICallbackVH(*I, this)};
  1181.   }
  1182.  
  1183.   Working.reserve(RPOT.size());
  1184.   for (size_t Index = 0; Index < RPOT.size(); ++Index)
  1185.     Working.emplace_back(Index);
  1186.   Freqs.resize(RPOT.size());
  1187. }
  1188.  
  1189. template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
  1190.   LLVM_DEBUG(dbgs() << "loop-detection\n");
  1191.   if (LI->empty())
  1192.     return;
  1193.  
  1194.   // Visit loops top down and assign them an index.
  1195.   std::deque<std::pair<const LoopT *, LoopData *>> Q;
  1196.   for (const LoopT *L : *LI)
  1197.     Q.emplace_back(L, nullptr);
  1198.   while (!Q.empty()) {
  1199.     const LoopT *Loop = Q.front().first;
  1200.     LoopData *Parent = Q.front().second;
  1201.     Q.pop_front();
  1202.  
  1203.     BlockNode Header = getNode(Loop->getHeader());
  1204.     assert(Header.isValid());
  1205.  
  1206.     Loops.emplace_back(Parent, Header);
  1207.     Working[Header.Index].Loop = &Loops.back();
  1208.     LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
  1209.  
  1210.     for (const LoopT *L : *Loop)
  1211.       Q.emplace_back(L, &Loops.back());
  1212.   }
  1213.  
  1214.   // Visit nodes in reverse post-order and add them to their deepest containing
  1215.   // loop.
  1216.   for (size_t Index = 0; Index < RPOT.size(); ++Index) {
  1217.     // Loop headers have already been mostly mapped.
  1218.     if (Working[Index].isLoopHeader()) {
  1219.       LoopData *ContainingLoop = Working[Index].getContainingLoop();
  1220.       if (ContainingLoop)
  1221.         ContainingLoop->Nodes.push_back(Index);
  1222.       continue;
  1223.     }
  1224.  
  1225.     const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
  1226.     if (!Loop)
  1227.       continue;
  1228.  
  1229.     // Add this node to its containing loop's member list.
  1230.     BlockNode Header = getNode(Loop->getHeader());
  1231.     assert(Header.isValid());
  1232.     const auto &HeaderData = Working[Header.Index];
  1233.     assert(HeaderData.isLoopHeader());
  1234.  
  1235.     Working[Index].Loop = HeaderData.Loop;
  1236.     HeaderData.Loop->Nodes.push_back(Index);
  1237.     LLVM_DEBUG(dbgs() << " - loop = " << getBlockName(Header)
  1238.                       << ": member = " << getBlockName(Index) << "\n");
  1239.   }
  1240. }
  1241.  
  1242. template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
  1243.   // Visit loops with the deepest first, and the top-level loops last.
  1244.   for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
  1245.     if (computeMassInLoop(*L))
  1246.       continue;
  1247.     auto Next = std::next(L);
  1248.     computeIrreducibleMass(&*L, L.base());
  1249.     L = std::prev(Next);
  1250.     if (computeMassInLoop(*L))
  1251.       continue;
  1252.     llvm_unreachable("unhandled irreducible control flow");
  1253.   }
  1254. }
  1255.  
  1256. template <class BT>
  1257. bool BlockFrequencyInfoImpl<BT>::computeMassInLoop(LoopData &Loop) {
  1258.   // Compute mass in loop.
  1259.   LLVM_DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
  1260.  
  1261.   if (Loop.isIrreducible()) {
  1262.     LLVM_DEBUG(dbgs() << "isIrreducible = true\n");
  1263.     Distribution Dist;
  1264.     unsigned NumHeadersWithWeight = 0;
  1265.     std::optional<uint64_t> MinHeaderWeight;
  1266.     DenseSet<uint32_t> HeadersWithoutWeight;
  1267.     HeadersWithoutWeight.reserve(Loop.NumHeaders);
  1268.     for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
  1269.       auto &HeaderNode = Loop.Nodes[H];
  1270.       const BlockT *Block = getBlock(HeaderNode);
  1271.       IsIrrLoopHeader.set(Loop.Nodes[H].Index);
  1272.       std::optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
  1273.       if (!HeaderWeight) {
  1274.         LLVM_DEBUG(dbgs() << "Missing irr loop header metadata on "
  1275.                           << getBlockName(HeaderNode) << "\n");
  1276.         HeadersWithoutWeight.insert(H);
  1277.         continue;
  1278.       }
  1279.       LLVM_DEBUG(dbgs() << getBlockName(HeaderNode)
  1280.                         << " has irr loop header weight " << *HeaderWeight
  1281.                         << "\n");
  1282.       NumHeadersWithWeight++;
  1283.       uint64_t HeaderWeightValue = *HeaderWeight;
  1284.       if (!MinHeaderWeight || HeaderWeightValue < MinHeaderWeight)
  1285.         MinHeaderWeight = HeaderWeightValue;
  1286.       if (HeaderWeightValue) {
  1287.         Dist.addLocal(HeaderNode, HeaderWeightValue);
  1288.       }
  1289.     }
  1290.     // As a heuristic, if some headers don't have a weight, give them the
  1291.     // minimum weight seen: a weight in the general range of the other
  1292.     // headers' weights avoids disrupting the existing trends too much, and
  1293.     // the minimum seems to perform better than the average.
  1294.     // FIXME: better update in the passes that drop the header weight.
  1295.     // If no headers have a weight, give them even weight (use weight 1).
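            // For example (hypothetical weights): with headers weighted {16, 4} and a
            // third header lacking metadata, the third header is assigned 4, the
            // minimum weight seen.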
  1296.     if (!MinHeaderWeight)
  1297.       MinHeaderWeight = 1;
  1298.     for (uint32_t H : HeadersWithoutWeight) {
  1299.       auto &HeaderNode = Loop.Nodes[H];
  1300.       assert(!getBlock(HeaderNode)->getIrrLoopHeaderWeight() &&
  1301.              "Shouldn't have a weight metadata");
  1302.       uint64_t MinWeight = *MinHeaderWeight;
  1303.       LLVM_DEBUG(dbgs() << "Giving weight " << MinWeight << " to "
  1304.                         << getBlockName(HeaderNode) << "\n");
  1305.       if (MinWeight)
  1306.         Dist.addLocal(HeaderNode, MinWeight);
  1307.     }
  1308.     distributeIrrLoopHeaderMass(Dist);
  1309.     for (const BlockNode &M : Loop.Nodes)
  1310.       if (!propagateMassToSuccessors(&Loop, M))
  1311.         llvm_unreachable("unhandled irreducible control flow");
  1312.     if (NumHeadersWithWeight == 0)
  1313.       // No headers have weight metadata. Adjust the header mass.
  1314.       adjustLoopHeaderMass(Loop);
  1315.   } else {
  1316.     Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
  1317.     if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
  1318.       llvm_unreachable("irreducible control flow to loop header!?");
  1319.     for (const BlockNode &M : Loop.members())
  1320.       if (!propagateMassToSuccessors(&Loop, M))
  1321.         // Irreducible backedge.
  1322.         return false;
  1323.   }
  1324.  
  1325.   computeLoopScale(Loop);
  1326.   packageLoop(Loop);
  1327.   return true;
  1328. }
  1329.  
  1330. template <class BT>
  1331. bool BlockFrequencyInfoImpl<BT>::tryToComputeMassInFunction() {
  1332.   // Compute mass in function.
  1333.   LLVM_DEBUG(dbgs() << "compute-mass-in-function\n");
  1334.   assert(!Working.empty() && "no blocks in function");
  1335.   assert(!Working[0].isLoopHeader() && "entry block is a loop header");
  1336.  
  1337.   Working[0].getMass() = BlockMass::getFull();
  1338.   for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
  1339.     // Check for nodes that have been packaged.
  1340.     BlockNode Node = getNode(I);
  1341.     if (Working[Node.Index].isPackaged())
  1342.       continue;
  1343.  
  1344.     if (!propagateMassToSuccessors(nullptr, Node))
  1345.       return false;
  1346.   }
  1347.   return true;
  1348. }
  1349.  
  1350. template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
  1351.   if (tryToComputeMassInFunction())
  1352.     return;
  1353.   computeIrreducibleMass(nullptr, Loops.begin());
  1354.   if (tryToComputeMassInFunction())
  1355.     return;
  1356.   llvm_unreachable("unhandled irreducible control flow");
  1357. }
  1358.  
  1359. template <class BT>
  1360. bool BlockFrequencyInfoImpl<BT>::needIterativeInference() const {
  1361.   if (!UseIterativeBFIInference)
  1362.     return false;
  1363.   if (!F->getFunction().hasProfileData())
  1364.     return false;
  1365.   // Apply iterative inference only if the function contains irreducible loops;
  1366.   // otherwise, computed block frequencies are reasonably correct.
  1367.   for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) {
  1368.     if (L->isIrreducible())
  1369.       return true;
  1370.   }
  1371.   return false;
  1372. }
  1373.  
  1374. template <class BT> void BlockFrequencyInfoImpl<BT>::applyIterativeInference() {
  1375.   // Extract blocks for processing: a block is considered for inference iff it
  1376.   // can be reached from the entry and can reach an exit along edges with a
  1377.   // positive probability. All other blocks are assigned a zero frequency and
  1378.   // are ignored in the computation.
  1379.   std::vector<const BlockT *> ReachableBlocks;
  1380.   findReachableBlocks(ReachableBlocks);
  1381.   if (ReachableBlocks.empty())
  1382.     return;
  1383.  
  1384.   // The map is used to index successors/predecessors of reachable blocks in
  1385.   // the ReachableBlocks vector.
  1386.   DenseMap<const BlockT *, size_t> BlockIndex;
  1387.   // Extract initial frequencies for the reachable blocks
  1388.   auto Freq = std::vector<Scaled64>(ReachableBlocks.size());
  1389.   Scaled64 SumFreq;
  1390.   for (size_t I = 0; I < ReachableBlocks.size(); I++) {
  1391.     const BlockT *BB = ReachableBlocks[I];
  1392.     BlockIndex[BB] = I;
  1393.     Freq[I] = getFloatingBlockFreq(BB);
  1394.     SumFreq += Freq[I];
  1395.   }
  1396.   assert(!SumFreq.isZero() && "empty initial block frequencies");
  1397.  
  1398.   LLVM_DEBUG(dbgs() << "Applying iterative inference for " << F->getName()
  1399.                     << " with " << ReachableBlocks.size() << " blocks\n");
  1400.  
  1401.   // Normalizing frequencies so they sum up to 1.0
  1402.   for (auto &Value : Freq) {
  1403.     Value /= SumFreq;
  1404.   }
  1405.  
  1406.   // Setting up edge probabilities using sparse matrix representation:
  1407.   // ProbMatrix[I] holds a vector of pairs (J, P) where Pr[J -> I | J] = P
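          // For illustration, take a hypothetical three-block CFG with edges
          //   A -> B (prob 3/4), A -> C (prob 1/4), B -> C (prob 1):
          // then ProbMatrix[B] = {(A, 3/4)} and ProbMatrix[C] = {(A, 1/4), (B, 1)},
          // with blocks identified by their BlockIndex rather than by name, and a
          // sink-to-entry transition ProbMatrix[A] = {(C, 1)} added afterwards.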
  1408.   ProbMatrixType ProbMatrix;
  1409.   initTransitionProbabilities(ReachableBlocks, BlockIndex, ProbMatrix);
  1410.  
  1411.   // Run the propagation
  1412.   iterativeInference(ProbMatrix, Freq);
  1413.  
  1414.   // Assign computed frequency values
  1415.   for (const BlockT &BB : *F) {
  1416.     auto Node = getNode(&BB);
  1417.     if (!Node.isValid())
  1418.       continue;
  1419.     if (BlockIndex.count(&BB)) {
  1420.       Freqs[Node.Index].Scaled = Freq[BlockIndex[&BB]];
  1421.     } else {
  1422.       Freqs[Node.Index].Scaled = Scaled64::getZero();
  1423.     }
  1424.   }
  1425. }
  1426.  
  1427. template <class BT>
  1428. void BlockFrequencyInfoImpl<BT>::iterativeInference(
  1429.     const ProbMatrixType &ProbMatrix, std::vector<Scaled64> &Freq) const {
  1430.   assert(0.0 < IterativeBFIPrecision && IterativeBFIPrecision < 1.0 &&
  1431.          "incorrectly specified precision");
  1432.   // Convert double precision to Scaled64
  1433.   const auto Precision =
  1434.       Scaled64::getInverse(static_cast<uint64_t>(1.0 / IterativeBFIPrecision));
  1435.   const size_t MaxIterations = IterativeBFIMaxIterationsPerBlock * Freq.size();
  1436.  
  1437. #ifndef NDEBUG
  1438.   LLVM_DEBUG(dbgs() << "  Initial discrepancy = "
  1439.                     << discrepancy(ProbMatrix, Freq).toString() << "\n");
  1440. #endif
  1441.  
  1442.   // Successors[I] holds the unique successors of the I-th block.
  1443.   auto Successors = std::vector<std::vector<size_t>>(Freq.size());
  1444.   for (size_t I = 0; I < Freq.size(); I++) {
  1445.     for (const auto &Jump : ProbMatrix[I]) {
  1446.       Successors[Jump.first].push_back(I);
  1447.     }
  1448.   }
  1449.  
  1450.   // To speed up the computation, we maintain a set of "active" blocks whose
  1451.   // frequencies need to be updated based on their incoming edges.
  1452.   // The set is dynamic and changes after every update. Initially, all blocks
  1453.   // with a positive frequency are active.
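          // In effect this is a worklist-driven relaxation: a block is re-examined only
          // after its own frequency, or that of one of its predecessors, has changed by
          // more than the requested precision.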
  1454.   auto IsActive = BitVector(Freq.size(), false);
  1455.   std::queue<size_t> ActiveSet;
  1456.   for (size_t I = 0; I < Freq.size(); I++) {
  1457.     if (Freq[I] > 0) {
  1458.       ActiveSet.push(I);
  1459.       IsActive[I] = true;
  1460.     }
  1461.   }
  1462.  
  1463.   // Iterate over the blocks propagating frequencies
  1464.   size_t It = 0;
  1465.   while (It++ < MaxIterations && !ActiveSet.empty()) {
  1466.     size_t I = ActiveSet.front();
  1467.     ActiveSet.pop();
  1468.     IsActive[I] = false;
  1469.  
  1470.     // Compute a new frequency for the block: NewFreq := Freq \times ProbMatrix.
  1471.     // Special care is taken for self-edges: the result is divided by
  1472.     // (1.0 - SelfProb), where SelfProb is the sum of the self-edge probabilities.
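            // That is, solving Freq[I] = Sum_{J != I} Freq[J] * Pr[J->I] + Freq[I] * SelfProb
            // for Freq[I] yields NewFreq = (Sum_{J != I} Freq[J] * Pr[J->I]) / (1 - SelfProb).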
  1473.     Scaled64 NewFreq;
  1474.     Scaled64 OneMinusSelfProb = Scaled64::getOne();
  1475.     for (const auto &Jump : ProbMatrix[I]) {
  1476.       if (Jump.first == I) {
  1477.         OneMinusSelfProb -= Jump.second;
  1478.       } else {
  1479.         NewFreq += Freq[Jump.first] * Jump.second;
  1480.       }
  1481.     }
  1482.     if (OneMinusSelfProb != Scaled64::getOne())
  1483.       NewFreq /= OneMinusSelfProb;
  1484.  
  1485.     // If the block's frequency has changed enough, make sure that the block
  1486.     // and its successors are in the active set.
  1487.     auto Change = Freq[I] >= NewFreq ? Freq[I] - NewFreq : NewFreq - Freq[I];
  1488.     if (Change > Precision) {
  1489.       ActiveSet.push(I);
  1490.       IsActive[I] = true;
  1491.       for (size_t Succ : Successors[I]) {
  1492.         if (!IsActive[Succ]) {
  1493.           ActiveSet.push(Succ);
  1494.           IsActive[Succ] = true;
  1495.         }
  1496.       }
  1497.     }
  1498.  
  1499.     // Update the frequency for the block
  1500.     Freq[I] = NewFreq;
  1501.   }
  1502.  
  1503.   LLVM_DEBUG(dbgs() << "  Completed " << It << " inference iterations"
  1504.                     << format(" (%0.0f per block)", double(It) / Freq.size())
  1505.                     << "\n");
  1506. #ifndef NDEBUG
  1507.   LLVM_DEBUG(dbgs() << "  Final   discrepancy = "
  1508.                     << discrepancy(ProbMatrix, Freq).toString() << "\n");
  1509. #endif
  1510. }
  1511.  
  1512. template <class BT>
  1513. void BlockFrequencyInfoImpl<BT>::findReachableBlocks(
  1514.     std::vector<const BlockT *> &Blocks) const {
  1515.   // Find all blocks to apply inference on, that is, blocks reachable from the
  1516.   // entry along edges with non-zero probabilities.
  1517.   std::queue<const BlockT *> Queue;
  1518.   SmallPtrSet<const BlockT *, 8> Reachable;
  1519.   const BlockT *Entry = &F->front();
  1520.   Queue.push(Entry);
  1521.   Reachable.insert(Entry);
  1522.   while (!Queue.empty()) {
  1523.     const BlockT *SrcBB = Queue.front();
  1524.     Queue.pop();
  1525.     for (const BlockT *DstBB : children<const BlockT *>(SrcBB)) {
  1526.       auto EP = BPI->getEdgeProbability(SrcBB, DstBB);
  1527.       if (EP.isZero())
  1528.         continue;
  1529.       if (Reachable.insert(DstBB).second)
  1530.         Queue.push(DstBB);
  1531.     }
  1532.   }
  1533.  
  1534.   // Symmetrically, find all blocks that can reach an exit, that is, blocks
  1535.   // backward reachable from the exit blocks along edges with non-zero probabilities.
  1536.   SmallPtrSet<const BlockT *, 8> InverseReachable;
  1537.   for (const BlockT &BB : *F) {
  1538.     // An exit block is a block without any successors
  1539.     bool HasSucc = GraphTraits<const BlockT *>::child_begin(&BB) !=
  1540.                    GraphTraits<const BlockT *>::child_end(&BB);
  1541.     if (!HasSucc && Reachable.count(&BB)) {
  1542.       Queue.push(&BB);
  1543.       InverseReachable.insert(&BB);
  1544.     }
  1545.   }
  1546.   while (!Queue.empty()) {
  1547.     const BlockT *SrcBB = Queue.front();
  1548.     Queue.pop();
  1549.     for (const BlockT *DstBB : children<Inverse<const BlockT *>>(SrcBB)) {
  1550.       auto EP = BPI->getEdgeProbability(DstBB, SrcBB);
  1551.       if (EP.isZero())
  1552.         continue;
  1553.       if (InverseReachable.insert(DstBB).second)
  1554.         Queue.push(DstBB);
  1555.     }
  1556.   }
  1557.  
  1558.   // Collect the result
  1559.   Blocks.reserve(F->size());
  1560.   for (const BlockT &BB : *F) {
  1561.     if (Reachable.count(&BB) && InverseReachable.count(&BB)) {
  1562.       Blocks.push_back(&BB);
  1563.     }
  1564.   }
  1565. }
  1566.  
  1567. template <class BT>
  1568. void BlockFrequencyInfoImpl<BT>::initTransitionProbabilities(
  1569.     const std::vector<const BlockT *> &Blocks,
  1570.     const DenseMap<const BlockT *, size_t> &BlockIndex,
  1571.     ProbMatrixType &ProbMatrix) const {
  1572.   const size_t NumBlocks = Blocks.size();
  1573.   auto Succs = std::vector<std::vector<std::pair<size_t, Scaled64>>>(NumBlocks);
  1574.   auto SumProb = std::vector<Scaled64>(NumBlocks);
  1575.  
  1576.   // Find unique successors and corresponding probabilities for every block
  1577.   for (size_t Src = 0; Src < NumBlocks; Src++) {
  1578.     const BlockT *BB = Blocks[Src];
  1579.     SmallPtrSet<const BlockT *, 2> UniqueSuccs;
  1580.     for (const auto SI : children<const BlockT *>(BB)) {
  1581.       // Ignore cold blocks
  1582.       if (BlockIndex.find(SI) == BlockIndex.end())
  1583.         continue;
  1584.       // Ignore parallel edges between BB and SI blocks
  1585.       if (!UniqueSuccs.insert(SI).second)
  1586.         continue;
  1587.       // Ignore jumps with zero probability
  1588.       auto EP = BPI->getEdgeProbability(BB, SI);
  1589.       if (EP.isZero())
  1590.         continue;
  1591.  
  1592.       auto EdgeProb =
  1593.           Scaled64::getFraction(EP.getNumerator(), EP.getDenominator());
  1594.       size_t Dst = BlockIndex.find(SI)->second;
  1595.       Succs[Src].push_back(std::make_pair(Dst, EdgeProb));
  1596.       SumProb[Src] += EdgeProb;
  1597.     }
  1598.   }
  1599.  
  1600.   // Add transitions for every jump with positive branch probability
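          // Each probability is divided by SumProb[Src] so that, after edges to ignored
          // (cold) blocks are dropped, the outgoing probabilities of a block still sum
          // to one.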
  1601.   ProbMatrix = ProbMatrixType(NumBlocks);
  1602.   for (size_t Src = 0; Src < NumBlocks; Src++) {
  1603.     // Ignore blocks w/o successors
  1604.     if (Succs[Src].empty())
  1605.       continue;
  1606.  
  1607.     assert(!SumProb[Src].isZero() && "Zero sum probability of non-exit block");
  1608.     for (auto &Jump : Succs[Src]) {
  1609.       size_t Dst = Jump.first;
  1610.       Scaled64 Prob = Jump.second;
  1611.       ProbMatrix[Dst].push_back(std::make_pair(Src, Prob / SumProb[Src]));
  1612.     }
  1613.   }
  1614.  
  1615.   // Add transitions from sinks to the source
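          // Feeding mass that reaches a sink back to the entry keeps the total frequency
          // circulating, which is what lets the iterative inference settle on a
          // steady-state distribution over the reachable blocks.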
  1616.   size_t EntryIdx = BlockIndex.find(&F->front())->second;
  1617.   for (size_t Src = 0; Src < NumBlocks; Src++) {
  1618.     if (Succs[Src].empty()) {
  1619.       ProbMatrix[EntryIdx].push_back(std::make_pair(Src, Scaled64::getOne()));
  1620.     }
  1621.   }
  1622. }
  1623.  
  1624. #ifndef NDEBUG
  1625. template <class BT>
  1626. BlockFrequencyInfoImplBase::Scaled64 BlockFrequencyInfoImpl<BT>::discrepancy(
  1627.     const ProbMatrixType &ProbMatrix, const std::vector<Scaled64> &Freq) const {
  1628.   assert(Freq[0] > 0 && "Incorrectly computed frequency of the entry block");
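          // The discrepancy is Sum_I |Freq[I] - Sum_J Freq[J] * Pr[J->I]|, i.e. how far
          // Freq is from a fixed point of the transition matrix, divided by the entry
          // frequency so the value is comparable across functions.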
  1629.   Scaled64 Discrepancy;
  1630.   for (size_t I = 0; I < ProbMatrix.size(); I++) {
  1631.     Scaled64 Sum;
  1632.     for (const auto &Jump : ProbMatrix[I]) {
  1633.       Sum += Freq[Jump.first] * Jump.second;
  1634.     }
  1635.     Discrepancy += Freq[I] >= Sum ? Freq[I] - Sum : Sum - Freq[I];
  1636.   }
  1637.   // Normalizing by the frequency of the entry block
  1638.   return Discrepancy / Freq[0];
  1639. }
  1640. #endif
  1641.  
  1642. /// \note This should be a lambda, but that crashes GCC 4.7.
  1643. namespace bfi_detail {
  1644.  
  1645. template <class BT> struct BlockEdgesAdder {
  1646.   using BlockT = BT;
  1647.   using LoopData = BlockFrequencyInfoImplBase::LoopData;
  1648.   using Successor = GraphTraits<const BlockT *>;
  1649.  
  1650.   const BlockFrequencyInfoImpl<BT> &BFI;
  1651.  
  1652.   explicit BlockEdgesAdder(const BlockFrequencyInfoImpl<BT> &BFI)
  1653.       : BFI(BFI) {}
  1654.  
  1655.   void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr,
  1656.                   const LoopData *OuterLoop) {
  1657.     const BlockT *BB = BFI.RPOT[Irr.Node.Index];
  1658.     for (const auto Succ : children<const BlockT *>(BB))
  1659.       G.addEdge(Irr, BFI.getNode(Succ), OuterLoop);
  1660.   }
  1661. };
  1662.  
  1663. } // end namespace bfi_detail
  1664.  
  1665. template <class BT>
  1666. void BlockFrequencyInfoImpl<BT>::computeIrreducibleMass(
  1667.     LoopData *OuterLoop, std::list<LoopData>::iterator Insert) {
  1668.   LLVM_DEBUG(dbgs() << "analyze-irreducible-in-";
  1669.              if (OuterLoop) dbgs()
  1670.              << "loop: " << getLoopName(*OuterLoop) << "\n";
  1671.              else dbgs() << "function\n");
  1672.  
  1673.   using namespace bfi_detail;
  1674.  
  1675.   // Ideally, addBlockEdges() would be declared here as a lambda, but that
  1676.   // crashes GCC 4.7.
  1677.   BlockEdgesAdder<BT> addBlockEdges(*this);
  1678.   IrreducibleGraph G(*this, OuterLoop, addBlockEdges);
  1679.  
  1680.   for (auto &L : analyzeIrreducible(G, OuterLoop, Insert))
  1681.     computeMassInLoop(L);
  1682.  
  1683.   if (!OuterLoop)
  1684.     return;
  1685.   updateLoopWithIrreducible(*OuterLoop);
  1686. }
  1687.  
  1688. // A helper function that converts a branch probability into an integer weight.
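        // BranchProbability uses a fixed denominator, so the numerator alone serves as
        // a relative weight.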
  1689. inline uint32_t getWeightFromBranchProb(const BranchProbability Prob) {
  1690.   return Prob.getNumerator();
  1691. }
  1692.  
  1693. template <class BT>
  1694. bool
  1695. BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(LoopData *OuterLoop,
  1696.                                                       const BlockNode &Node) {
  1697.   LLVM_DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
  1698.   // Calculate probability for successors.
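          // A node standing for an already-packaged loop distributes its mass along the
          // loop's recorded exit edges; an ordinary block distributes along its CFG
          // successors using BPI edge probabilities as weights.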
  1699.   Distribution Dist;
  1700.   if (auto *Loop = Working[Node.Index].getPackagedLoop()) {
  1701.     assert(Loop != OuterLoop && "Cannot propagate mass in a packaged loop");
  1702.     if (!addLoopSuccessorsToDist(OuterLoop, *Loop, Dist))
  1703.       // Irreducible backedge.
  1704.       return false;
  1705.   } else {
  1706.     const BlockT *BB = getBlock(Node);
  1707.     for (auto SI = GraphTraits<const BlockT *>::child_begin(BB),
  1708.               SE = GraphTraits<const BlockT *>::child_end(BB);
  1709.          SI != SE; ++SI)
  1710.       if (!addToDist(
  1711.               Dist, OuterLoop, Node, getNode(*SI),
  1712.               getWeightFromBranchProb(BPI->getEdgeProbability(BB, SI))))
  1713.         // Irreducible backedge.
  1714.         return false;
  1715.   }
  1716.  
  1717.   // Distribute mass to successors, saving exit and backedge data in the
  1718.   // loop header.
  1719.   distributeMass(Node, OuterLoop, Dist);
  1720.   return true;
  1721. }
  1722.  
  1723. template <class BT>
  1724. raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
  1725.   if (!F)
  1726.     return OS;
  1727.   OS << "block-frequency-info: " << F->getName() << "\n";
  1728.   for (const BlockT &BB : *F) {
  1729.     OS << " - " << bfi_detail::getBlockName(&BB) << ": float = ";
  1730.     getFloatingBlockFreq(&BB).print(OS, 5)
  1731.         << ", int = " << getBlockFreq(&BB).getFrequency();
  1732.     if (std::optional<uint64_t> ProfileCount =
  1733.         BlockFrequencyInfoImplBase::getBlockProfileCount(
  1734.             F->getFunction(), getNode(&BB)))
  1735.       OS << ", count = " << *ProfileCount;
  1736.     if (std::optional<uint64_t> IrrLoopHeaderWeight =
  1737.             BB.getIrrLoopHeaderWeight())
  1738.       OS << ", irr_loop_header_weight = " << *IrrLoopHeaderWeight;
  1739.     OS << "\n";
  1740.   }
  1741.  
  1742.   // Add an extra newline for readability.
  1743.   OS << "\n";
  1744.   return OS;
  1745. }
  1746.  
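        // Compare this BFI's computed frequencies against \p Other for the same
        // function: block counts and per-block integer frequencies must agree; any
        // differences are printed to dbgs() and a mismatch triggers an assertion.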
  1747. template <class BT>
  1748. void BlockFrequencyInfoImpl<BT>::verifyMatch(
  1749.     BlockFrequencyInfoImpl<BT> &Other) const {
  1750.   bool Match = true;
  1751.   DenseMap<const BlockT *, BlockNode> ValidNodes;
  1752.   DenseMap<const BlockT *, BlockNode> OtherValidNodes;
  1753.   for (auto &Entry : Nodes) {
  1754.     const BlockT *BB = Entry.first;
  1755.     if (BB) {
  1756.       ValidNodes[BB] = Entry.second.first;
  1757.     }
  1758.   }
  1759.   for (auto &Entry : Other.Nodes) {
  1760.     const BlockT *BB = Entry.first;
  1761.     if (BB) {
  1762.       OtherValidNodes[BB] = Entry.second.first;
  1763.     }
  1764.   }
  1765.   unsigned NumValidNodes = ValidNodes.size();
  1766.   unsigned NumOtherValidNodes = OtherValidNodes.size();
  1767.   if (NumValidNodes != NumOtherValidNodes) {
  1768.     Match = false;
  1769.     dbgs() << "Number of blocks mismatch: " << NumValidNodes << " vs "
  1770.            << NumOtherValidNodes << "\n";
  1771.   } else {
  1772.     for (auto &Entry : ValidNodes) {
  1773.       const BlockT *BB = Entry.first;
  1774.       BlockNode Node = Entry.second;
  1775.       if (OtherValidNodes.count(BB)) {
  1776.         BlockNode OtherNode = OtherValidNodes[BB];
  1777.         const auto &Freq = Freqs[Node.Index];
  1778.         const auto &OtherFreq = Other.Freqs[OtherNode.Index];
  1779.         if (Freq.Integer != OtherFreq.Integer) {
  1780.           Match = false;
  1781.           dbgs() << "Freq mismatch: " << bfi_detail::getBlockName(BB) << " "
  1782.                  << Freq.Integer << " vs " << OtherFreq.Integer << "\n";
  1783.         }
  1784.       } else {
  1785.         Match = false;
  1786.         dbgs() << "Block " << bfi_detail::getBlockName(BB) << " index "
  1787.                << Node.Index << " does not exist in Other.\n";
  1788.       }
  1789.     }
  1790.     // If there's a valid node in OtherValidNodes that's not in ValidNodes,
  1791.     // either the size check above or the lookup in OtherValidNodes will fail.
  1792.   }
  1793.   if (!Match) {
  1794.     dbgs() << "This\n";
  1795.     print(dbgs());
  1796.     dbgs() << "Other\n";
  1797.     Other.print(dbgs());
  1798.   }
  1799.   assert(Match && "BFI mismatch");
  1800. }
  1801.  
  1802. // Graph trait base class for block frequency information graph
  1803. // viewer.
  1804.  
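        // GVDT_None renders no frequency label, GVDT_Fraction prints the frequency via
        // printBlockFreq(), GVDT_Integer prints the raw integer frequency, and
        // GVDT_Count prints the profile-derived block count when available ("Unknown"
        // otherwise).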
  1805. enum GVDAGType { GVDT_None, GVDT_Fraction, GVDT_Integer, GVDT_Count };
  1806.  
  1807. template <class BlockFrequencyInfoT, class BranchProbabilityInfoT>
  1808. struct BFIDOTGraphTraitsBase : public DefaultDOTGraphTraits {
  1809.   using GTraits = GraphTraits<BlockFrequencyInfoT *>;
  1810.   using NodeRef = typename GTraits::NodeRef;
  1811.   using EdgeIter = typename GTraits::ChildIteratorType;
  1812.   using NodeIter = typename GTraits::nodes_iterator;
  1813.  
  1814.   uint64_t MaxFrequency = 0;
  1815.  
  1816.   explicit BFIDOTGraphTraitsBase(bool isSimple = false)
  1817.       : DefaultDOTGraphTraits(isSimple) {}
  1818.  
  1819.   static StringRef getGraphName(const BlockFrequencyInfoT *G) {
  1820.     return G->getFunction()->getName();
  1821.   }
  1822.  
  1823.   std::string getNodeAttributes(NodeRef Node, const BlockFrequencyInfoT *Graph,
  1824.                                 unsigned HotPercentThreshold = 0) {
  1825.     std::string Result;
  1826.     if (!HotPercentThreshold)
  1827.       return Result;
  1828.  
  1829.     // Compute MaxFrequency on the fly:
  1830.     if (!MaxFrequency) {
  1831.       for (NodeIter I = GTraits::nodes_begin(Graph),
  1832.                     E = GTraits::nodes_end(Graph);
  1833.            I != E; ++I) {
  1834.         NodeRef N = *I;
  1835.         MaxFrequency =
  1836.             std::max(MaxFrequency, Graph->getBlockFreq(N).getFrequency());
  1837.       }
  1838.     }
  1839.     BlockFrequency Freq = Graph->getBlockFreq(Node);
  1840.     BlockFrequency HotFreq =
  1841.         (BlockFrequency(MaxFrequency) *
  1842.          BranchProbability::getBranchProbability(HotPercentThreshold, 100));
  1843.  
  1844.     if (Freq < HotFreq)
  1845.       return Result;
  1846.  
  1847.     raw_string_ostream OS(Result);
  1848.     OS << "color=\"red\"";
  1849.     OS.flush();
  1850.     return Result;
  1851.   }
  1852.  
  1853.   std::string getNodeLabel(NodeRef Node, const BlockFrequencyInfoT *Graph,
  1854.                            GVDAGType GType, int layout_order = -1) {
  1855.     std::string Result;
  1856.     raw_string_ostream OS(Result);
  1857.  
  1858.     if (layout_order != -1)
  1859.       OS << Node->getName() << "[" << layout_order << "] : ";
  1860.     else
  1861.       OS << Node->getName() << " : ";
  1862.     switch (GType) {
  1863.     case GVDT_Fraction:
  1864.       Graph->printBlockFreq(OS, Node);
  1865.       break;
  1866.     case GVDT_Integer:
  1867.       OS << Graph->getBlockFreq(Node).getFrequency();
  1868.       break;
  1869.     case GVDT_Count: {
  1870.       auto Count = Graph->getBlockProfileCount(Node);
  1871.       if (Count)
  1872.         OS << *Count;
  1873.       else
  1874.         OS << "Unknown";
  1875.       break;
  1876.     }
  1877.     case GVDT_None:
  1878.       llvm_unreachable("If we are not supposed to render a graph we should "
  1879.                        "never reach this point.");
  1880.     }
  1881.     return Result;
  1882.   }
  1883.  
  1884.   std::string getEdgeAttributes(NodeRef Node, EdgeIter EI,
  1885.                                 const BlockFrequencyInfoT *BFI,
  1886.                                 const BranchProbabilityInfoT *BPI,
  1887.                                 unsigned HotPercentThreshold = 0) {
  1888.     std::string Str;
  1889.     if (!BPI)
  1890.       return Str;
  1891.  
  1892.     BranchProbability BP = BPI->getEdgeProbability(Node, EI);
  1893.     uint32_t N = BP.getNumerator();
  1894.     uint32_t D = BP.getDenominator();
  1895.     double Percent = 100.0 * N / D;
  1896.     raw_string_ostream OS(Str);
  1897.     OS << format("label=\"%.1f%%\"", Percent);
  1898.  
  1899.     if (HotPercentThreshold) {
  1900.       BlockFrequency EFreq = BFI->getBlockFreq(Node) * BP;
  1901.       BlockFrequency HotFreq = BlockFrequency(MaxFrequency) *
  1902.                                BranchProbability(HotPercentThreshold, 100);
  1903.  
  1904.       if (EFreq >= HotFreq) {
  1905.         OS << ",color=\"red\"";
  1906.       }
  1907.     }
  1908.  
  1909.     OS.flush();
  1910.     return Str;
  1911.   }
  1912. };
  1913.  
  1914. } // end namespace llvm
  1915.  
  1916. #undef DEBUG_TYPE
  1917.  
  1918. #endif // LLVM_ANALYSIS_BLOCKFREQUENCYINFOIMPL_H
  1919.