  1. //===- llvm/Analysis/VectorUtils.h - Vector utilities -----------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file defines some vectorizer utilities.
  10. //
  11. //===----------------------------------------------------------------------===//
  12.  
  13. #ifndef LLVM_ANALYSIS_VECTORUTILS_H
  14. #define LLVM_ANALYSIS_VECTORUTILS_H
  15.  
  16. #include "llvm/ADT/MapVector.h"
  17. #include "llvm/ADT/SmallVector.h"
  18. #include "llvm/Analysis/LoopAccessAnalysis.h"
  19. #include "llvm/Support/CheckedArithmetic.h"
  20.  
  21. namespace llvm {
  22. class TargetLibraryInfo;
  23.  
  24. /// Describes the type of Parameters
  25. enum class VFParamKind {
  26.   Vector,            // No semantic information.
  27.   OMP_Linear,        // declare simd linear(i)
  28.   OMP_LinearRef,     // declare simd linear(ref(i))
  29.   OMP_LinearVal,     // declare simd linear(val(i))
  30.   OMP_LinearUVal,    // declare simd linear(uval(i))
  31.   OMP_LinearPos,     // declare simd linear(i:c) uniform(c)
  32.   OMP_LinearValPos,  // declare simd linear(val(i:c)) uniform(c)
  33.   OMP_LinearRefPos,  // declare simd linear(ref(i:c)) uniform(c)
  34.   OMP_LinearUValPos, // declare simd linear(uval(i:c)) uniform(c)
  35.   OMP_Uniform,       // declare simd uniform(i)
  36.   GlobalPredicate,   // Global logical predicate that acts on all lanes
  37.                      // of the input and output mask concurrently. For
  38.                      // example, it is implied by the `M` token in the
  39.                      // Vector Function ABI mangled name.
  40.   Unknown
  41. };
  42.  
  43. /// Describes the type of Instruction Set Architecture
  44. enum class VFISAKind {
  45.   AdvancedSIMD, // AArch64 Advanced SIMD (NEON)
  46.   SVE,          // AArch64 Scalable Vector Extension
  47.   SSE,          // x86 SSE
  48.   AVX,          // x86 AVX
  49.   AVX2,         // x86 AVX2
  50.   AVX512,       // x86 AVX512
  51.   LLVM,         // LLVM internal ISA for functions that are not
  52.                 // attached to an existing ABI via name mangling.
  53.   Unknown // Unknown ISA
  54. };
  55.  
  56. /// Encapsulates information needed to describe a parameter.
  57. ///
  58. /// The description of the parameter is not linked directly to
  59. /// OpenMP or any other vector function description. This structure
  60. /// is extendible to handle other paradigms that describe vector
  61. /// functions and their parameters.
  62. struct VFParameter {
  63.   unsigned ParamPos;         // Parameter Position in Scalar Function.
  64.   VFParamKind ParamKind;     // Kind of Parameter.
  65.   int LinearStepOrPos = 0;   // Step or Position of the Parameter.
  66.   Align Alignment = Align(); // Optional alignment in bytes, defaulted to 1.
  67.  
  68.   // Comparison operator.
  69.   bool operator==(const VFParameter &Other) const {
  70.     return std::tie(ParamPos, ParamKind, LinearStepOrPos, Alignment) ==
  71.            std::tie(Other.ParamPos, Other.ParamKind, Other.LinearStepOrPos,
  72.                     Other.Alignment);
  73.   }
  74. };
  75.  
  76. /// Contains the information about the kind of vectorization
  77. /// available.
  78. ///
  79. /// This object is independent of the paradigm used to
  80. /// represent vector functions. In particular, it is not attached to
  81. /// any target-specific ABI.
  82. struct VFShape {
  83.   ElementCount VF;                        // Vectorization factor.
  84.   SmallVector<VFParameter, 8> Parameters; // List of parameter information.
  85.   // Comparison operator.
  86.   bool operator==(const VFShape &Other) const {
  87.     return std::tie(VF, Parameters) == std::tie(Other.VF, Other.Parameters);
  88.   }
  89.  
  90.   /// Update the parameter in position P.ParamPos to P.
  91.   void updateParam(VFParameter P) {
  92.     assert(P.ParamPos < Parameters.size() && "Invalid parameter position.");
  93.     Parameters[P.ParamPos] = P;
  94.     assert(hasValidParameterList() && "Invalid parameter list");
  95.   }
  96.  
  97.   // Retrieve the VFShape that can be used to map a (scalar) function to itself,
  98.   // with VF = 1.
  99.   static VFShape getScalarShape(const CallInst &CI) {
  100.     return VFShape::get(CI, ElementCount::getFixed(1),
  101.                         /*HasGlobalPredicate*/ false);
  102.   }
  103.  
  104.   // Retrieve the basic vectorization shape of the function, where all
  105.   // parameters are mapped to VFParamKind::Vector with \p EC
  106.   // lanes. Specifies whether the function has a Global Predicate
  107.   // argument via \p HasGlobalPred.
  108.   static VFShape get(const CallInst &CI, ElementCount EC, bool HasGlobalPred) {
  109.     SmallVector<VFParameter, 8> Parameters;
  110.     for (unsigned I = 0; I < CI.arg_size(); ++I)
  111.       Parameters.push_back(VFParameter({I, VFParamKind::Vector}));
  112.     if (HasGlobalPred)
  113.       Parameters.push_back(
  114.           VFParameter({CI.arg_size(), VFParamKind::GlobalPredicate}));
  115.  
  116.     return {EC, Parameters};
  117.   }
  118.   /// Validation check on the Parameters in the VFShape.
  119.   bool hasValidParameterList() const;
  120. };
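// A minimal illustrative sketch (not part of the original header): building a
// 4-lane shape for a hypothetical CallInst `CI` with two arguments and then
// marking its second parameter as uniform. `CI` is an assumed name.
//
//   VFShape Shape = VFShape::get(CI, ElementCount::getFixed(4),
//                                /*HasGlobalPred=*/false);
//   Shape.updateParam({/*ParamPos=*/1, VFParamKind::OMP_Uniform});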
  121.  
  122. /// Holds the VFShape for a specific scalar to vector function mapping.
  123. struct VFInfo {
  124.   VFShape Shape;          /// Classification of the vector function.
  125.   std::string ScalarName; /// Scalar Function Name.
  126.   std::string VectorName; /// Vector Function Name associated to this VFInfo.
  127.   VFISAKind ISA;          /// Instruction Set Architecture.
  128. };
  129.  
  130. namespace VFABI {
  131. /// LLVM Internal VFABI ISA token for vector functions.
  132. static constexpr char const *_LLVM_ = "_LLVM_";
  133. /// Prefix for internal name redirection for a vector function that
  134. /// tells the compiler to scalarize the call using the scalar name
  135. /// of the function. For example, a mangled name like
  136. /// `_ZGV_LLVM_N2v_foo(_LLVM_Scalarize_foo)` would tell the
  137. /// vectorizer to vectorize the scalar call `foo`, and to scalarize
  138. /// it once vectorization is done.
  139. static constexpr char const *_LLVM_Scalarize_ = "_LLVM_Scalarize_";
  140.  
  141. /// Function to construct a VFInfo out of a mangled name in the
  142. /// following format:
  143. ///
  144. /// <VFABI_name>{(<redirection>)}
  145. ///
  146. /// where <VFABI_name> is the name of the vector function, mangled according
  147. /// to the rules described in the Vector Function ABI of the target vector
  148. /// extension (or <isa> from now on). The <VFABI_name> is in the following
  149. /// format:
  150. ///
  151. /// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)]
  152. ///
  153. /// This method supports demangling rules for the following <isa>:
  154. ///
  155. /// * AArch64: https://developer.arm.com/docs/101129/latest
  156. ///
  157. /// * x86 (libmvec): https://sourceware.org/glibc/wiki/libmvec and
  158. ///  https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt
  159. ///
  160. /// \param MangledName -> input string in the format
  161. /// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)].
  162. /// \param M -> Module used to retrieve information about the vector
  163. /// function that cannot be retrieved from the mangled
  164. /// name. At the moment, this parameter is needed only to retrieve the
  165. /// Vectorization Factor of scalable vector functions from their
  166. /// respective IR declarations.
  167. std::optional<VFInfo> tryDemangleForVFABI(StringRef MangledName,
  168.                                           const Module &M);
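// Illustrative sketch: demangling a fixed-width AArch64 AdvancedSIMD name.
// `M` is an assumed Module; per the note above it is only consulted for the
// VF of scalable vector functions. If demangling succeeds, one would expect
// Info->ScalarName == "sin", Info->VectorName == "_ZGVnN2v_sin", and a fixed
// VF of 2.
//
//   std::optional<VFInfo> Info =
//       VFABI::tryDemangleForVFABI("_ZGVnN2v_sin", M);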
  169.  
  170. /// This routine mangles the given VectorName according to the LangRef
  171. /// specification for vector-function-abi-variant attribute and is specific to
  172. /// the TLI mappings. It is the responsibility of the caller to make sure that
  173. /// this is only used if all parameters in the vector function are vector type.
  174. /// This returned string holds scalar-to-vector mapping:
  175. ///    _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)
  176. ///
  177. /// where:
  178. ///
  179. /// <isa> = "_LLVM_"
  180. /// <mask> = "N". Note: TLI does not support masked interfaces.
  181. /// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
  182. ///          field of the `VecDesc` struct. If the number of lanes is scalable
  183. ///          then 'x' is printed instead.
  184. /// <vparams> = "v", repeated numArgs times.
  185. /// <scalarname> = the name of the scalar function.
  186. /// <vectorname> = the name of the vector function.
  187. std::string mangleTLIVectorName(StringRef VectorName, StringRef ScalarName,
  188.                                 unsigned numArgs, ElementCount VF);
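// Illustrative sketch with assumed names (not real TLI entries): following
// the scheme above, this call should produce
// "_ZGV_LLVM_N2v_sinf(vsinf_v2)".
//
//   std::string Mangled = VFABI::mangleTLIVectorName(
//       "vsinf_v2", "sinf", /*numArgs=*/1, ElementCount::getFixed(2));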
  189.  
  190. /// Retrieve the `VFParamKind` from a string token.
  191. VFParamKind getVFParamKindFromString(const StringRef Token);
  192.  
  193. // Name of the attribute where the variant mappings are stored.
  194. static constexpr char const *MappingsAttrName = "vector-function-abi-variant";
  195.  
  196. /// Populates a set of strings representing the Vector Function ABI variants
  197. /// associated to the CallInst CI. If the CI does not contain the
  198. /// vector-function-abi-variant attribute, we return without populating
  199. /// VariantMappings, i.e. callers of getVectorVariantNames need not check for
  200. /// the presence of the attribute (see InjectTLIMappings).
  201. void getVectorVariantNames(const CallInst &CI,
  202.                            SmallVectorImpl<std::string> &VariantMappings);
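// Illustrative usage, assuming `CI` is a CallInst in scope:
//
//   SmallVector<std::string, 8> Variants;
//   VFABI::getVectorVariantNames(CI, Variants);
//   // Variants stays empty when CI lacks the
//   // "vector-function-abi-variant" attribute.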
  203. } // end namespace VFABI
  204.  
  205. /// The Vector Function Database.
  206. ///
  207. /// Helper class used to find the vector functions associated to a
  208. /// scalar CallInst.
  209. class VFDatabase {
  210.   /// The Module of the CallInst CI.
  211.   const Module *M;
  212.   /// The CallInst instance being queried for scalar to vector mappings.
  213.   const CallInst &CI;
  214.   /// List of vector function descriptors associated to the call
  215.   /// instruction.
  216.   const SmallVector<VFInfo, 8> ScalarToVectorMappings;
  217.  
  218.   /// Retrieve the scalar-to-vector mappings associated to the rules of
  219.   /// a Vector Function ABI.
  220.   static void getVFABIMappings(const CallInst &CI,
  221.                                SmallVectorImpl<VFInfo> &Mappings) {
  222.     if (!CI.getCalledFunction())
  223.       return;
  224.  
  225.     const StringRef ScalarName = CI.getCalledFunction()->getName();
  226.  
  227.     SmallVector<std::string, 8> ListOfStrings;
  228.     // The check for the vector-function-abi-variant attribute is done when
  229.     // retrieving the vector variant names here.
  230.     VFABI::getVectorVariantNames(CI, ListOfStrings);
  231.     if (ListOfStrings.empty())
  232.       return;
  233.     for (const auto &MangledName : ListOfStrings) {
  234.       const std::optional<VFInfo> Shape =
  235.           VFABI::tryDemangleForVFABI(MangledName, *(CI.getModule()));
  236.       // A match is found via scalar and vector names, and also by
  237.       // ensuring that the variant described in the attribute has a
  238.       // corresponding definition or declaration of the vector
  239.       // function in the Module M.
  240.       if (Shape && (Shape->ScalarName == ScalarName)) {
  241.         assert(CI.getModule()->getFunction(Shape->VectorName) &&
  242.                "Vector function is missing.");
  243.         Mappings.push_back(*Shape);
  244.       }
  245.     }
  246.   }
  247.  
  248. public:
  249.   /// Retrieve all the VFInfo instances associated to the CallInst CI.
  250.   static SmallVector<VFInfo, 8> getMappings(const CallInst &CI) {
  251.     SmallVector<VFInfo, 8> Ret;
  252.  
  253.     // Get mappings from the Vector Function ABI variants.
  254.     getVFABIMappings(CI, Ret);
  255.  
  256.     // Other non-VFABI variants should be retrieved here.
  257.  
  258.     return Ret;
  259.   }
  260.  
  261.   /// Constructor, requires a CallInst instance.
  262.   VFDatabase(CallInst &CI)
  263.       : M(CI.getModule()), CI(CI),
  264.         ScalarToVectorMappings(VFDatabase::getMappings(CI)) {}
  265.   /// \defgroup VFDatabase query interface.
  266.   ///
  267.   /// @{
  268.   /// Retrieve the Function with VFShape \p Shape.
  269.   Function *getVectorizedFunction(const VFShape &Shape) const {
  270.     if (Shape == VFShape::getScalarShape(CI))
  271.       return CI.getCalledFunction();
  272.  
  273.     for (const auto &Info : ScalarToVectorMappings)
  274.       if (Info.Shape == Shape)
  275.         return M->getFunction(Info.VectorName);
  276.  
  277.     return nullptr;
  278.   }
  279.   /// @}
  280. };
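// Illustrative usage sketch: querying the database for a 4-lane variant of a
// hypothetical call `CI` whose callee carries VFABI variant attributes.
//
//   VFDatabase DB(CI);
//   VFShape Shape = VFShape::get(CI, ElementCount::getFixed(4),
//                                /*HasGlobalPred=*/false);
//   if (Function *VecF = DB.getVectorizedFunction(Shape)) {
//     // A 4-lane vector variant of the callee exists in the module.
//   }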
  281.  
  282. template <typename T> class ArrayRef;
  283. class DemandedBits;
  284. class GetElementPtrInst;
  285. template <typename InstTy> class InterleaveGroup;
  286. class IRBuilderBase;
  287. class Loop;
  288. class ScalarEvolution;
  289. class TargetTransformInfo;
  290. class Type;
  291. class Value;
  292.  
  293. namespace Intrinsic {
  294. typedef unsigned ID;
  295. }
  296.  
  297. /// A helper function for converting Scalar types to vector types. If
  298. /// the incoming type is void, we return void. If the EC represents a
  299. /// scalar, we return the scalar type.
  300. inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
  301.   if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
  302.     return Scalar;
  303.   return VectorType::get(Scalar, EC);
  304. }
  305.  
  306. inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
  307.   return ToVectorTy(Scalar, ElementCount::getFixed(VF));
  308. }
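// Illustrative expectations, assuming `I32Ty` is the i32 Type of some
// LLVMContext:
//
//   Type *V4 = ToVectorTy(I32Ty, 4);  // <4 x i32>
//   Type *VS = ToVectorTy(I32Ty, ElementCount::getScalable(2));
//                                     // <vscale x 2 x i32>
//   Type *S  = ToVectorTy(I32Ty, 1);  // i32, returned unchanged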
  309.  
  310. /// Identify if the intrinsic is trivially vectorizable.
  311. /// This method returns true if the intrinsic's argument types are all scalars
  312. /// for the scalar form of the intrinsic and all vectors (or scalars handled by
  313. /// isVectorIntrinsicWithScalarOpAtArg) for the vector form of the intrinsic.
  314. bool isTriviallyVectorizable(Intrinsic::ID ID);
  315.  
  316. /// Identifies if the vector form of the intrinsic has a scalar operand.
  317. bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
  318.                                         unsigned ScalarOpdIdx);
  319.  
  320. /// Identifies if the vector form of the intrinsic has an operand that has
  321. /// an overloaded type.
  322. bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, unsigned OpdIdx);
  323.  
  324. /// Returns the intrinsic ID for the call.
  325. /// For the input call instruction it finds the corresponding intrinsic and
  326. /// returns its intrinsic ID; if none is found, it returns not_intrinsic.
  327. Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
  328.                                           const TargetLibraryInfo *TLI);
  329.  
  330. /// Find the operand of the GEP that should be checked for consecutive
  331. /// stores. This ignores trailing indices that have no effect on the final
  332. /// pointer.
  333. unsigned getGEPInductionOperand(const GetElementPtrInst *Gep);
  334.  
  335. /// If the argument is a GEP, then returns the operand identified by
  336. /// getGEPInductionOperand. However, if there is some other non-loop-invariant
  337. /// operand, it returns that instead.
  338. Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp);
  339.  
  340. /// If a value has only one user that is a CastInst, return it.
  341. Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty);
  342.  
  343. /// Get the stride of a pointer access in a loop. Looks for symbolic
  344. /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
  345. Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp);
  346.  
  347. /// Given a vector and an element number, see if the scalar value is
  348. /// already around as a register, for example if it were inserted then extracted
  349. /// from the vector.
  350. Value *findScalarElement(Value *V, unsigned EltNo);
  351.  
  352. /// If all non-negative \p Mask elements are the same value, return that value.
  353. /// If all elements are negative (undefined) or \p Mask contains different
  354. /// non-negative values, return -1.
  355. int getSplatIndex(ArrayRef<int> Mask);
  356.  
  357. /// Get splat value if the input is a splat vector or return nullptr.
  358. /// The value may be extracted from a splat constant vector or from
  359. /// a sequence of instructions that broadcast a single value into a vector.
  360. Value *getSplatValue(const Value *V);
  361.  
  362. /// Return true if each element of the vector value \p V is poisoned or equal to
  363. /// every other non-poisoned element. If an index element is specified, either
  364. /// every element of the vector is poisoned or the element at that index is not
  365. /// poisoned and equal to every other non-poisoned element.
  366. /// This may be more powerful than the related getSplatValue() because it is
  367. /// not limited by finding a scalar source value to a splatted vector.
  368. bool isSplatValue(const Value *V, int Index = -1, unsigned Depth = 0);
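// Illustrative contrast with getSplatValue(), assuming %v is an arbitrary
// <4 x i32> value:
//
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
//
// isSplatValue(%s) is expected to be true (every lane copies lane 0 of %v),
// even though getSplatValue(%s) may find no single scalar source and return
// nullptr.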
  369.  
  370. /// Transform a shuffle mask's output demanded element mask into demanded
  371. /// element masks for the 2 operands, returns false if the mask isn't valid.
  372. /// Both \p DemandedLHS and \p DemandedRHS are initialised to [SrcWidth].
  373. /// \p AllowUndefElts permits "-1" indices to be treated as undef.
  374. bool getShuffleDemandedElts(int SrcWidth, ArrayRef<int> Mask,
  375.                             const APInt &DemandedElts, APInt &DemandedLHS,
  376.                             APInt &DemandedRHS, bool AllowUndefElts = false);
  377.  
  378. /// Replace each shuffle mask index with the scaled sequential indices for an
  379. /// equivalent mask of narrowed elements. Mask elements that are less than 0
  380. /// (sentinel values) are repeated in the output mask.
  381. ///
  382. /// Example with Scale = 4:
  383. ///   <4 x i32> <3, 2, 0, -1> -->
  384. ///   <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1>
  385. ///
  386. /// This is the reverse process of widening shuffle mask elements, but it always
  387. /// succeeds because the indexes can always be multiplied (scaled up) to map to
  388. /// narrower vector elements.
  389. void narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
  390.                            SmallVectorImpl<int> &ScaledMask);
  391.  
  392. /// Try to transform a shuffle mask by replacing elements with the scaled index
  393. /// for an equivalent mask of widened elements. If all mask elements that would
  394. /// map to a wider element of the new mask are the same negative number
  395. /// (sentinel value), that element of the new mask is the same value. If any
  396. /// element in a given slice is negative and some other element in that slice is
  397. /// not the same value, return false (partial matches with sentinel values are
  398. /// not allowed).
  399. ///
  400. /// Example with Scale = 4:
  401. ///   <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1> -->
  402. ///   <4 x i32> <3, 2, 0, -1>
  403. ///
  404. /// This is the reverse process of narrowing shuffle mask elements if it
  405. /// succeeds. This transform is not always possible because indexes may not
  406. /// divide evenly (scale down) to map to wider vector elements.
  407. bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
  408.                           SmallVectorImpl<int> &ScaledMask);
  409.  
  410. /// Repeatedly apply `widenShuffleMaskElts()` for as long as it succeeds,
  411. /// to get the shuffle mask with widest possible elements.
  412. void getShuffleMaskWithWidestElts(ArrayRef<int> Mask,
  413.                                   SmallVectorImpl<int> &ScaledMask);
  414.  
  415. /// Splits and processes shuffle mask depending on the number of input and
  416. /// output registers. The function does 2 main things: 1) splits the
  417. /// source/destination vectors into real registers; 2) does the mask analysis to
  418. /// identify which real registers are permuted. Then the function processes
  419. /// resulting registers mask using provided action items. If no input register
  420. /// is defined, \p NoInputAction action is used. If only 1 input register is
  421. /// used, \p SingleInputAction is used, otherwise \p ManyInputsAction is used to
  422. /// process > 2 input registers and masks.
  423. /// \param Mask Original shuffle mask.
  424. /// \param NumOfSrcRegs Number of source registers.
  425. /// \param NumOfDestRegs Number of destination registers.
  426. /// \param NumOfUsedRegs Number of actually used destination registers.
  427. void processShuffleMasks(
  428.     ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
  429.     unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
  430.     function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
  431.     function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);
  432.  
  433. /// Compute a map of integer instructions to their minimum legal type
  434. /// size.
  435. ///
  436. /// C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int
  437. /// type (e.g. i32) whenever arithmetic is performed on them.
  438. ///
  439. /// For targets with native i8 or i16 operations, usually InstCombine can shrink
  440. /// the arithmetic type down again. However InstCombine refuses to create
  441. /// illegal types, so for targets without i8 or i16 registers, the lengthening
  442. /// and shrinking remains.
  443. ///
  444. /// Most SIMD ISAs (e.g. NEON) however support vectors of i8 or i16 even when
  445. /// their scalar equivalents do not, so during vectorization it is important to
  446. /// remove these lengthens and truncates when deciding the profitability of
  447. /// vectorization.
  448. ///
  449. /// This function analyzes the given range of instructions and determines the
  450. /// minimum type size each can be converted to. It attempts to remove or
  451. /// minimize type size changes across each def-use chain, so for example in the
  452. /// following code:
  453. ///
  454. ///   %1 = load i8, i8*
  455. ///   %2 = add i8 %1, 2
  456. ///   %3 = load i16, i16*
  457. ///   %4 = zext i8 %2 to i32
  458. ///   %5 = zext i16 %3 to i32
  459. ///   %6 = add i32 %4, %5
  460. ///   %7 = trunc i32 %6 to i16
  461. ///
  462. /// Instruction %6 must be done at least in i16, so computeMinimumValueSizes
  463. /// will return: {%1: 16, %2: 16, %3: 16, %4: 16, %5: 16, %6: 16, %7: 16}.
  464. ///
  465. /// If the optional TargetTransformInfo is provided, this function tries harder
  466. /// to do less work by only looking at illegal types.
  467. MapVector<Instruction*, uint64_t>
  468. computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
  469.                          DemandedBits &DB,
  470.                          const TargetTransformInfo *TTI=nullptr);
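// Illustrative usage, assuming `BB` is a loop body block and `DB` is the
// DemandedBits analysis of its function:
//
//   MapVector<Instruction *, uint64_t> MinBWs =
//       computeMinimumValueSizes({BB}, DB, /*TTI=*/nullptr);
//   // MinBWs[I] is the narrowest width the analysis believes I can be
//   // evaluated in.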
  471.  
  472. /// Compute the union of two access-group lists.
  473. ///
  474. /// If the list contains just one access group, it is returned directly. If the
  475. /// list is empty, returns nullptr.
  476. MDNode *uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2);
  477.  
  478. /// Compute the access-group list of access groups that @p Inst1 and @p Inst2
  479. /// are both in. If either instruction does not access memory at all, it is
  480. /// considered to be in every list.
  481. ///
  482. /// If the list contains just one access group, it is returned directly. If the
  483. /// list is empty, returns nullptr.
  484. MDNode *intersectAccessGroups(const Instruction *Inst1,
  485.                               const Instruction *Inst2);
  486.  
  487. /// Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath,
  488. /// MD_nontemporal, MD_access_group].
  489. /// For K in Kinds, we get the MDNode for K from each of the
  490. /// elements of VL, compute their "intersection" (i.e., the most generic
  491. /// metadata value that covers all of the individual values), and set I's
  492. /// metadata for K equal to the intersection value.
  493. ///
  494. /// This function always sets a (possibly null) value for each K in Kinds.
  495. Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
  496.  
  497. /// Create a mask that filters the members of an interleave group where there
  498. /// are gaps.
  499. ///
  500. /// For example, the mask for \p Group with interleave-factor 3
  501. /// and \p VF 4, that has only its first member present is:
  502. ///
  503. ///   <1,0,0,1,0,0,1,0,0,1,0,0>
  504. ///
  505. /// Note: The result is a mask of 0's and 1's, as opposed to the other
  506. /// create[*]Mask() utilities which create a shuffle mask (mask that
  507. /// consists of indices).
  508. Constant *createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
  509.                                const InterleaveGroup<Instruction> &Group);
  510.  
  511. /// Create a mask with replicated elements.
  512. ///
  513. /// This function creates a shuffle mask for replicating each of the \p VF
  514. /// elements in a vector \p ReplicationFactor times. It can be used to
  515. /// transform a mask of \p VF elements into a mask of
  516. /// \p VF * \p ReplicationFactor elements used by a predicated
  517. /// interleaved-group of loads/stores whose Interleaved-factor ==
  518. /// \p ReplicationFactor.
  519. ///
  520. /// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
  521. ///
  522. ///   <0,0,0,1,1,1,2,2,2,3,3,3>
  523. llvm::SmallVector<int, 16> createReplicatedMask(unsigned ReplicationFactor,
  524.                                                 unsigned VF);
  525.  
  526. /// Create an interleave shuffle mask.
  527. ///
  528. /// This function creates a shuffle mask for interleaving \p NumVecs vectors of
  529. /// vectorization factor \p VF into a single wide vector. The mask is of the
  530. /// form:
  531. ///
  532. ///   <0, VF, VF * 2, ..., VF * (NumVecs - 1), 1, VF + 1, VF * 2 + 1, ...>
  533. ///
  534. /// For example, the mask for VF = 4 and NumVecs = 2 is:
  535. ///
  536. ///   <0, 4, 1, 5, 2, 6, 3, 7>.
  537. llvm::SmallVector<int, 16> createInterleaveMask(unsigned VF, unsigned NumVecs);
  538.  
  539. /// Create a stride shuffle mask.
  540. ///
  541. /// This function creates a shuffle mask whose elements begin at \p Start and
  542. /// are incremented by \p Stride. The mask can be used to deinterleave an
  543. /// interleaved vector into separate vectors of vectorization factor \p VF. The
  544. /// mask is of the form:
  545. ///
  546. ///   <Start, Start + Stride, ..., Start + Stride * (VF - 1)>
  547. ///
  548. /// For example, the mask for Start = 0, Stride = 2, and VF = 4 is:
  549. ///
  550. ///   <0, 2, 4, 6>
  551. llvm::SmallVector<int, 16> createStrideMask(unsigned Start, unsigned Stride,
  552.                                             unsigned VF);
  553.  
  554. /// Create a sequential shuffle mask.
  555. ///
  556. /// This function creates a shuffle mask whose elements are sequential and begin
  557. /// at \p Start.  The mask contains \p NumInts integers and is padded with \p
  558. /// NumUndefs undef values. The mask is of the form:
  559. ///
  560. ///   <Start, Start + 1, ... Start + NumInts - 1, undef_1, ... undef_NumUndefs>
  561. ///
  562. /// For example, the mask for Start = 0, NumInts = 4, and NumUndefs = 4 is:
  563. ///
  564. ///   <0, 1, 2, 3, undef, undef, undef, undef>
  565. llvm::SmallVector<int, 16>
  566. createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs);
  567.  
  568. /// Given a shuffle mask for a binary shuffle, create the equivalent shuffle
  569. /// mask assuming both operands are identical. This assumes that the unary
  570. /// shuffle will use elements from operand 0 (operand 1 will be unused).
  571. llvm::SmallVector<int, 16> createUnaryMask(ArrayRef<int> Mask,
  572.                                            unsigned NumElts);
  573.  
  574. /// Concatenate a list of vectors.
  575. ///
  576. /// This function generates code that concatenates the vectors in \p Vecs into a
  577. /// single large vector. The number of vectors should be greater than one, and
  578. /// their element types should be the same. The number of elements in the
  579. /// vectors should also be the same; however, if the last vector has fewer
  580. /// elements, it will be padded with undefs.
  581. Value *concatenateVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vecs);
  582.  
  583. /// Given a mask vector of i1, return true if all of the elements of this
  584. /// predicate mask are known to be false or undef.  That is, return true if all
  585. /// lanes can be assumed inactive.
  586. bool maskIsAllZeroOrUndef(Value *Mask);
  587.  
  588. /// Given a mask vector of i1, return true if all of the elements of this
  589. /// predicate mask are known to be true or undef.  That is, return true if all
  590. /// lanes can be assumed active.
  591. bool maskIsAllOneOrUndef(Value *Mask);
  592.  
  593. /// Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y)
  594. /// for each lane which may be active.
  595. APInt possiblyDemandedEltsInMask(Value *Mask);
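// Illustrative expectations for a constant mask <4 x i1> <1, 0, undef, 1>:
// maskIsAllZeroOrUndef and maskIsAllOneOrUndef should both return false (the
// mask mixes known-true and known-false lanes), while
// possiblyDemandedEltsInMask should return an APInt with bits 0, 2 and 3 set,
// since only lane 1 is known inactive.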
  596.  
  597. /// A group of interleaved loads/stores sharing the same stride and
  598. /// close to each other.
  599. ///
  600. /// Each member in this group has an index starting from 0, and the largest
  601. /// index should be less than the interleave factor, which is equal to the absolute
  602. /// value of the access's stride.
  603. ///
  604. /// E.g. An interleaved load group of factor 4:
  605. ///        for (unsigned i = 0; i < 1024; i+=4) {
  606. ///          a = A[i];                           // Member of index 0
  607. ///          b = A[i+1];                         // Member of index 1
  608. ///          d = A[i+3];                         // Member of index 3
  609. ///          ...
  610. ///        }
  611. ///
  612. ///      An interleaved store group of factor 4:
  613. ///        for (unsigned i = 0; i < 1024; i+=4) {
  614. ///          ...
  615. ///          A[i]   = a;                         // Member of index 0
  616. ///          A[i+1] = b;                         // Member of index 1
  617. ///          A[i+2] = c;                         // Member of index 2
  618. ///          A[i+3] = d;                         // Member of index 3
  619. ///        }
  620. ///
  621. /// Note: the interleaved load group could have gaps (missing members), but
  622. /// the interleaved store group doesn't allow gaps.
  623. template <typename InstTy> class InterleaveGroup {
  624. public:
  625.   InterleaveGroup(uint32_t Factor, bool Reverse, Align Alignment)
  626.       : Factor(Factor), Reverse(Reverse), Alignment(Alignment),
  627.         InsertPos(nullptr) {}
  628.  
  629.   InterleaveGroup(InstTy *Instr, int32_t Stride, Align Alignment)
  630.       : Alignment(Alignment), InsertPos(Instr) {
  631.     Factor = std::abs(Stride);
  632.     assert(Factor > 1 && "Invalid interleave factor");
  633.  
  634.     Reverse = Stride < 0;
  635.     Members[0] = Instr;
  636.   }
  637.  
  638.   bool isReverse() const { return Reverse; }
  639.   uint32_t getFactor() const { return Factor; }
  640.   Align getAlign() const { return Alignment; }
  641.   uint32_t getNumMembers() const { return Members.size(); }
  642.  
  643.   /// Try to insert a new member \p Instr with index \p Index and
  644.   /// alignment \p NewAlign. The index is relative to the leader and it could be
  645.   /// negative if it is the new leader.
  646.   ///
  647.   /// \returns false if the instruction doesn't belong to the group.
  648.   bool insertMember(InstTy *Instr, int32_t Index, Align NewAlign) {
  649.     // Make sure the key fits in an int32_t.
  650.     std::optional<int32_t> MaybeKey = checkedAdd(Index, SmallestKey);
  651.     if (!MaybeKey)
  652.       return false;
  653.     int32_t Key = *MaybeKey;
  654.  
  655.     // Skip if the key is used for either the tombstone or empty special values.
  656.     if (DenseMapInfo<int32_t>::getTombstoneKey() == Key ||
  657.         DenseMapInfo<int32_t>::getEmptyKey() == Key)
  658.       return false;
  659.  
  660.     // Skip if there is already a member with the same index.
  661.     if (Members.find(Key) != Members.end())
  662.       return false;
  663.  
  664.     if (Key > LargestKey) {
  665.       // The largest index is always less than the interleave factor.
  666.       if (Index >= static_cast<int32_t>(Factor))
  667.         return false;
  668.  
  669.       LargestKey = Key;
  670.     } else if (Key < SmallestKey) {
  671.  
  672.       // Make sure the largest index fits in an int32_t.
  673.       std::optional<int32_t> MaybeLargestIndex = checkedSub(LargestKey, Key);
  674.       if (!MaybeLargestIndex)
  675.         return false;
  676.  
  677.       // The largest index is always less than the interleave factor.
  678.       if (*MaybeLargestIndex >= static_cast<int64_t>(Factor))
  679.         return false;
  680.  
  681.       SmallestKey = Key;
  682.     }
  683.  
  684.     // It's always safe to select the minimum alignment.
  685.     Alignment = std::min(Alignment, NewAlign);
  686.     Members[Key] = Instr;
  687.     return true;
  688.   }
  689.  
  690.   /// Get the member with the given index \p Index
  691.   ///
  692.   /// \returns nullptr if the group contains no such member.
  693.   InstTy *getMember(uint32_t Index) const {
  694.     int32_t Key = SmallestKey + Index;
  695.     return Members.lookup(Key);
  696.   }
  697.  
  698.   /// Get the index for the given member. Unlike the key in the member
  699.   /// map, the index starts from 0.
  700.   uint32_t getIndex(const InstTy *Instr) const {
  701.     for (auto I : Members) {
  702.       if (I.second == Instr)
  703.         return I.first - SmallestKey;
  704.     }
  705.  
  706.     llvm_unreachable("InterleaveGroup contains no such member");
  707.   }
  708.  
  709.   InstTy *getInsertPos() const { return InsertPos; }
  710.   void setInsertPos(InstTy *Inst) { InsertPos = Inst; }
  711.  
  712.   /// Add metadata (e.g. alias info) from the instructions in this group to \p
  713.   /// NewInst.
  714.   ///
  715.   /// FIXME: this function currently does not add noalias metadata a la
  716.   /// addNewMetadata. To do that we need to compute the intersection of the
  717.   /// noalias info from all members.
  718.   void addMetadata(InstTy *NewInst) const;
  719.  
  720.   /// Returns true if this Group requires a scalar iteration to handle gaps.
  721.   bool requiresScalarEpilogue() const {
  722.     // If the last member of the Group exists, then a scalar epilog is not
  723.     // needed for this group.
  724.     if (getMember(getFactor() - 1))
  725.       return false;
  726.  
  727.     // We have a group with gaps. It therefore can't be a reversed access,
  728.     // because such groups get invalidated (TODO).
  729.     assert(!isReverse() && "Group should have been invalidated");
  730.  
  731.     // This is a group of loads, with gaps, and without a last member.
  732.     return true;
  733.   }
  734.  
  735. private:
  736.   uint32_t Factor; // Interleave Factor.
  737.   bool Reverse;
  738.   Align Alignment;
  739.   DenseMap<int32_t, InstTy *> Members;
  740.   int32_t SmallestKey = 0;
  741.   int32_t LargestKey = 0;
  742.  
  743.   // To avoid breaking dependences, vectorized instructions of an interleave
  744.   // group should be inserted at either the first load or the last store in
  745.   // program order.
  746.   //
  747.   // E.g. %even = load i32             // Insert Position
  748.   //      %add = add i32 %even         // Use of %even
  749.   //      %odd = load i32
  750.   //
  751.   //      store i32 %even
  752.   //      %odd = add i32               // Def of %odd
  753.   //      store i32 %odd               // Insert Position
  754.   InstTy *InsertPos;
  755. };
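// Illustrative sketch: forming a factor-2 group from two assumed adjacent
// loads, `LoadA` (the leader, accessing A[i]) and `LoadB` (accessing A[i+1]):
//
//   InterleaveGroup<Instruction> Group(LoadA, /*Stride=*/2, Align(4));
//   bool Inserted = Group.insertMember(LoadB, /*Index=*/1, Align(4));
//   // Inserted is true; Group.getMember(1) == LoadB, Group.getFactor() == 2.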
  756.  
  757. /// Drive the analysis of interleaved memory accesses in the loop.
  758. ///
  759. /// Use this class to analyze interleaved accesses only when we can vectorize
  760. /// a loop. Otherwise it's meaningless to do analysis as the vectorization
  761. /// on interleaved accesses is unsafe.
  762. ///
  763. /// The analysis collects interleave groups and records the relationships
  764. /// between the member and the group in a map.
  765. class InterleavedAccessInfo {
  766. public:
  767.   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
  768.                         DominatorTree *DT, LoopInfo *LI,
  769.                         const LoopAccessInfo *LAI)
  770.       : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
  771.  
  772.   ~InterleavedAccessInfo() { invalidateGroups(); }
  773.  
  774.   /// Analyze the interleaved accesses and collect them in interleave
  775.   /// groups. Substitute symbolic strides using \p Strides.
  776.   /// Consider also predicated loads/stores in the analysis if
  777.   /// \p EnableMaskedInterleavedGroup is true.
  778.   void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
  779.  
  780.   /// Invalidate groups, e.g., in case all blocks in the loop will be predicated
  781.   /// contrary to original assumption. Although we currently prevent group
  782.   /// formation for predicated accesses, we may be able to relax this limitation
  783.   /// in the future once we handle more complicated blocks. Returns true if any
  784.   /// groups were invalidated.
  785.   bool invalidateGroups() {
  786.     if (InterleaveGroups.empty()) {
  787.       assert(
  788.           !RequiresScalarEpilogue &&
  789.           "RequiresScalarEpilogue should not be set without interleave groups");
  790.       return false;
  791.     }
  792.  
  793.     InterleaveGroupMap.clear();
  794.     for (auto *Ptr : InterleaveGroups)
  795.       delete Ptr;
  796.     InterleaveGroups.clear();
  797.     RequiresScalarEpilogue = false;
  798.     return true;
  799.   }
  800.  
  801.   /// Check if \p Instr belongs to any interleave group.
  802.   bool isInterleaved(Instruction *Instr) const {
  803.     return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end();
  804.   }
  805.  
  806.   /// Get the interleave group that \p Instr belongs to.
  807.   ///
  808.   /// \returns nullptr if \p Instr does not belong to any group.
  809.   InterleaveGroup<Instruction> *
  810.   getInterleaveGroup(const Instruction *Instr) const {
  811.     return InterleaveGroupMap.lookup(Instr);
  812.   }
  813.  
  814.   iterator_range<SmallPtrSetIterator<llvm::InterleaveGroup<Instruction> *>>
  815.   getInterleaveGroups() {
  816.     return make_range(InterleaveGroups.begin(), InterleaveGroups.end());
  817.   }
  818.  
  819.   /// Returns true if an interleaved group that may access memory
  820.   /// out-of-bounds requires a scalar epilogue iteration for correctness.
  821.   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
  822.  
  823.   /// Invalidate groups that require a scalar epilogue (due to gaps). This can
  824.   /// happen when optimizing for size forbids a scalar epilogue, and the gap
  825.   /// cannot be filtered by masking the load/store.
  826.   void invalidateGroupsRequiringScalarEpilogue();
  827.  
  828.   /// Returns true if we have any interleave groups.
  829.   bool hasGroups() const { return !InterleaveGroups.empty(); }
  830.  
  831. private:
  832.   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
  833.   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
  834.   /// The interleaved access analysis can also add new predicates (for example
  835.   /// by versioning strides of pointers).
  836.   PredicatedScalarEvolution &PSE;
  837.  
  838.   Loop *TheLoop;
  839.   DominatorTree *DT;
  840.   LoopInfo *LI;
  841.   const LoopAccessInfo *LAI;
  842.  
  843.   /// True if the loop may contain non-reversed interleaved groups with
  844.   /// out-of-bounds accesses. We ensure we don't speculatively access memory
  845.   /// out-of-bounds by executing at least one scalar epilogue iteration.
  846.   bool RequiresScalarEpilogue = false;
  847.  
  848.   /// Holds the relationships between the members and the interleave group.
  849.   DenseMap<Instruction *, InterleaveGroup<Instruction> *> InterleaveGroupMap;
  850.  
  851.   SmallPtrSet<InterleaveGroup<Instruction> *, 4> InterleaveGroups;
  852.  
  853.   /// Holds dependences among the memory accesses in the loop. It maps a source
  854.   /// access to a set of dependent sink accesses.
  855.   DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
  856.  
  857.   /// The descriptor for a strided memory access.
  858.   struct StrideDescriptor {
  859.     StrideDescriptor() = default;
  860.     StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
  861.                      Align Alignment)
  862.         : Stride(Stride), Scev(Scev), Size(Size), Alignment(Alignment) {}
  863.  
  864.     // The access's stride. It is negative for a reverse access.
  865.     int64_t Stride = 0;
  866.  
  867.     // The scalar expression of this access.
  868.     const SCEV *Scev = nullptr;
  869.  
  870.     // The size of the memory object.
  871.     uint64_t Size = 0;
  872.  
  873.     // The alignment of this access.
  874.     Align Alignment;
  875.   };
  876.  
  877.   /// A type for holding instructions and their stride descriptors.
  878.   using StrideEntry = std::pair<Instruction *, StrideDescriptor>;
  879.  
  880.   /// Create a new interleave group with the given instruction \p Instr,
  881.   /// stride \p Stride and alignment \p Alignment.
  882.   ///
  883.   /// \returns the newly created interleave group.
  884.   InterleaveGroup<Instruction> *
  885.   createInterleaveGroup(Instruction *Instr, int Stride, Align Alignment) {
  886.     assert(!InterleaveGroupMap.count(Instr) &&
  887.            "Already in an interleaved access group");
  888.     InterleaveGroupMap[Instr] =
  889.         new InterleaveGroup<Instruction>(Instr, Stride, Alignment);
  890.     InterleaveGroups.insert(InterleaveGroupMap[Instr]);
  891.     return InterleaveGroupMap[Instr];
  892.   }
  893.  
  894.   /// Release the group and remove all the relationships.
  895.   void releaseGroup(InterleaveGroup<Instruction> *Group) {
  896.     for (unsigned i = 0; i < Group->getFactor(); i++)
  897.       if (Instruction *Member = Group->getMember(i))
  898.         InterleaveGroupMap.erase(Member);
  899.  
  900.     InterleaveGroups.erase(Group);
  901.     delete Group;
  902.   }
  903.  
  904.   /// Collect all the accesses with a constant stride in program order.
  905.   void collectConstStrideAccesses(
  906.       MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
  907.       const ValueToValueMap &Strides);
  908.  
  909.   /// Returns true if \p Stride is allowed in an interleaved group.
  910.   static bool isStrided(int Stride);
  911.  
  912.   /// Returns true if \p BB is a predicated block.
  913.   bool isPredicated(BasicBlock *BB) const {
  914.     return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
  915.   }
  916.  
  917.   /// Returns true if LoopAccessInfo can be used for dependence queries.
  918.   bool areDependencesValid() const {
  919.     return LAI && LAI->getDepChecker().getDependences();
  920.   }
  921.  
  922.   /// Returns true if memory accesses \p A and \p B can be reordered, if
  923.   /// necessary, when constructing interleaved groups.
  924.   ///
  925.   /// \p A must precede \p B in program order. We return false if reordering is
  926.   /// not necessary or is prevented because \p A and \p B may be dependent.
  927.   bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
  928.                                                  StrideEntry *B) const {
  929.     // Code motion for interleaved accesses can potentially hoist strided loads
  930.     // and sink strided stores. The code below checks the legality of the
  931.     // following two conditions:
  932.     //
  933.     // 1. Potentially moving a strided load (B) before any store (A) that
  934.     //    precedes B, or
  935.     //
  936.     // 2. Potentially moving a strided store (A) after any load or store (B)
  937.     //    that A precedes.
  938.     //
  939.     // It's legal to reorder A and B if we know there isn't a dependence from A
  940.     // to B. Note that this determination is conservative since some
  941.     // dependences could potentially be reordered safely.
  942.  
  943.     // A is potentially the source of a dependence.
  944.     auto *Src = A->first;
  945.     auto SrcDes = A->second;
  946.  
  947.     // B is potentially the sink of a dependence.
  948.     auto *Sink = B->first;
  949.     auto SinkDes = B->second;
  950.  
  951.     // Code motion for interleaved accesses can't violate WAR dependences.
  952.     // Thus, reordering is legal if the source isn't a write.
  953.     if (!Src->mayWriteToMemory())
  954.       return true;
  955.  
  956.     // At least one of the accesses must be strided.
  957.     if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
  958.       return true;
  959.  
  960.     // If dependence information is not available from LoopAccessInfo,
  961.     // conservatively assume the instructions can't be reordered.
  962.     if (!areDependencesValid())
  963.       return false;
  964.  
  965.     // If we know there is a dependence from source to sink, assume the
  966.     // instructions can't be reordered. Otherwise, reordering is legal.
  967.     return Dependences.find(Src) == Dependences.end() ||
  968.            !Dependences.lookup(Src).count(Sink);
  969.   }
  970.  
  971.   /// Collect the dependences from LoopAccessInfo.
  972.   ///
  973.   /// We process the dependences once during the interleaved access analysis to
  974.   /// enable constant-time dependence queries.
  975.   void collectDependences() {
  976.     if (!areDependencesValid())
  977.       return;
  978.     auto *Deps = LAI->getDepChecker().getDependences();
  979.     for (auto Dep : *Deps)
  980.       Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
  981.   }
  982. };
  983.  
  984. } // llvm namespace
  985.  
  986. #endif
  987.