//===- llvm/Analysis/VectorUtils.h - Vector utilities -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines some vectorizer utilities.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_VECTORUTILS_H
#define LLVM_ANALYSIS_VECTORUTILS_H

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/CheckedArithmetic.h"

namespace llvm {
class TargetLibraryInfo;

/// Describes the type of Parameters
enum class VFParamKind {
  Vector,            // No semantic information.
  OMP_Linear,        // declare simd linear(i)
  OMP_LinearRef,     // declare simd linear(ref(i))
  OMP_LinearVal,     // declare simd linear(val(i))
  OMP_LinearUVal,    // declare simd linear(uval(i))
  OMP_LinearPos,     // declare simd linear(i:c) uniform(c)
  OMP_LinearValPos,  // declare simd linear(val(i:c)) uniform(c)
  OMP_LinearRefPos,  // declare simd linear(ref(i:c)) uniform(c)
  OMP_LinearUValPos, // declare simd linear(uval(i:c)) uniform(c)
  OMP_Uniform,       // declare simd uniform(i)
  GlobalPredicate,   // Global logical predicate that acts on all lanes
                     // of the input and output mask concurrently. For
                     // example, it is implied by the `M` token in the
                     // Vector Function ABI mangled name.
  Unknown
};

/// Describes the type of Instruction Set Architecture
enum class VFISAKind {
  AdvancedSIMD, // AArch64 Advanced SIMD (NEON)
  SVE,          // AArch64 Scalable Vector Extension
  SSE,          // x86 SSE
  AVX,          // x86 AVX
  AVX2,         // x86 AVX2
  AVX512,       // x86 AVX512
  LLVM,         // LLVM internal ISA for functions that are not
                // attached to an existing ABI via name mangling.
  Unknown       // Unknown ISA
};

/// Encapsulates information needed to describe a parameter.
///
/// The description of the parameter is not linked directly to
/// OpenMP or any other vector function description. This structure
/// is extendible to handle other paradigms that describe vector
/// functions and their parameters.
struct VFParameter {
  unsigned ParamPos;         // Parameter Position in Scalar Function.
  VFParamKind ParamKind;     // Kind of Parameter.
  int LinearStepOrPos = 0;   // Step or Position of the Parameter.
  Align Alignment = Align(); // Optional alignment in bytes, defaulted to 1.

  // Comparison operator.
  bool operator==(const VFParameter &Other) const {
    return std::tie(ParamPos, ParamKind, LinearStepOrPos, Alignment) ==
           std::tie(Other.ParamPos, Other.ParamKind, Other.LinearStepOrPos,
                    Other.Alignment);
  }
};

/// Contains the information about the kind of vectorization
/// available.
///
/// This object is independent of the paradigm used to
/// represent vector functions. In particular, it is not attached to
/// any target-specific ABI.
struct VFShape {
  ElementCount VF;                        // Vectorization factor.
  SmallVector<VFParameter, 8> Parameters; // List of parameter information.
  // Comparison operator.
  bool operator==(const VFShape &Other) const {
    return std::tie(VF, Parameters) == std::tie(Other.VF, Other.Parameters);
  }

  /// Update the parameter in position P.ParamPos to P.
  void updateParam(VFParameter P) {
    assert(P.ParamPos < Parameters.size() && "Invalid parameter position.");
    Parameters[P.ParamPos] = P;
    assert(hasValidParameterList() && "Invalid parameter list");
  }

  // Retrieve the VFShape that can be used to map a (scalar) function to itself,
  // with VF = 1.
  static VFShape getScalarShape(const CallInst &CI) {
    return VFShape::get(CI, ElementCount::getFixed(1),
                        /*HasGlobalPredicate*/ false);
  }

  // Retrieve the basic vectorization shape of the function, where all
  // parameters are mapped to VFParamKind::Vector with \p EC
  // lanes. Specifies whether the function has a Global Predicate
  // argument via \p HasGlobalPred.
  static VFShape get(const CallInst &CI, ElementCount EC, bool HasGlobalPred) {
    SmallVector<VFParameter, 8> Parameters;
    for (unsigned I = 0; I < CI.arg_size(); ++I)
      Parameters.push_back(VFParameter({I, VFParamKind::Vector}));
    if (HasGlobalPred)
      Parameters.push_back(
          VFParameter({CI.arg_size(), VFParamKind::GlobalPredicate}));

    return {EC, Parameters};
  }
  /// Validation check on the Parameters in the VFShape.
  bool hasValidParameterList() const;
};
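
// Illustrative sketch, for exposition only: how a client can describe a
// 4-lane, masked vectorization of a call site. `CI` is an assumed call to a
// two-argument scalar function; the helper name is hypothetical.
inline VFShape getExampleMaskedShape(const CallInst &CI) {
  VFShape Shape = VFShape::get(CI, ElementCount::getFixed(4),
                               /*HasGlobalPred=*/true);
  // For a two-argument callee, Shape.Parameters now holds {0, Vector},
  // {1, Vector} and {2, GlobalPredicate}, and Shape.VF is the fixed count 4.
  return Shape;
}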

/// Holds the VFShape for a specific scalar to vector function mapping.
struct VFInfo {
  VFShape Shape;          /// Classification of the vector function.
  std::string ScalarName; /// Scalar Function Name.
  std::string VectorName; /// Vector Function Name associated to this VFInfo.
  VFISAKind ISA;          /// Instruction Set Architecture.
};

namespace VFABI {
/// LLVM Internal VFABI ISA token for vector functions.
static constexpr char const *_LLVM_ = "_LLVM_";
/// Prefix for internal name redirection for vector function that
/// tells the compiler to scalarize the call using the scalar name
/// of the function. For example, a mangled name like
/// `_ZGV_LLVM_N2v_foo(_LLVM_Scalarize_foo)` would tell the
/// vectorizer to vectorize the scalar call `foo`, and to scalarize
/// it once vectorization is done.
static constexpr char const *_LLVM_Scalarize_ = "_LLVM_Scalarize_";

/// Function to construct a VFInfo out of a mangled name in the
/// following format:
///
/// <VFABI_name>{(<redirection>)}
///
/// where <VFABI_name> is the name of the vector function, mangled according
/// to the rules described in the Vector Function ABI of the target vector
/// extension (or <isa> from now on). The <VFABI_name> is in the following
/// format:
///
/// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)]
///
/// This method supports demangling rules for the following <isa>:
///
/// * AArch64: https://developer.arm.com/docs/101129/latest
///
/// * x86 (libmvec): https://sourceware.org/glibc/wiki/libmvec and
///  https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt
///
/// \param MangledName -> input string in the format
/// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)].
/// \param M -> Module used to retrieve information about the vector
/// function that is not possible to retrieve from the mangled
/// name. At the moment, this parameter is needed only to retrieve the
/// Vectorization Factor of scalable vector functions from their
/// respective IR declarations.
std::optional<VFInfo> tryDemangleForVFABI(StringRef MangledName,
                                          const Module &M);
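
// Illustrative sketch, for exposition only: demangling a fixed-width AArch64
// Advanced SIMD name. `M` is an assumed Module; the helper name is
// hypothetical.
inline bool demanglesToTwoLaneSin(const Module &M) {
  std::optional<VFInfo> Info = tryDemangleForVFABI("_ZGVnN2v_sin", M);
  // On success: Info->ScalarName is "sin", Info->VectorName is "_ZGVnN2v_sin",
  // Info->Shape.VF is the fixed element count 2 and Info->ISA is
  // VFISAKind::AdvancedSIMD.
  return Info && Info->Shape.VF == ElementCount::getFixed(2);
}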

/// This routine mangles the given VectorName according to the LangRef
/// specification for vector-function-abi-variant attribute and is specific to
/// the TLI mappings. It is the responsibility of the caller to make sure that
/// this is only used if all parameters in the vector function are vector type.
/// This returned string holds scalar-to-vector mapping:
///    _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)
///
/// where:
///
/// <isa> = "_LLVM_"
/// <mask> = "N". Note: TLI does not support masked interfaces.
/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
///          field of the `VecDesc` struct. If the number of lanes is scalable
///          then 'x' is printed instead.
/// <vparams> = "v", as many as are the numArgs.
/// <scalarname> = the name of the scalar function.
/// <vectorname> = the name of the vector function.
std::string mangleTLIVectorName(StringRef VectorName, StringRef ScalarName,
                                unsigned numArgs, ElementCount VF);
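
// Illustrative sketch, for exposition only: the mangling produced for a
// hypothetical TLI entry that maps `sinf` to a 4-lane `vsinf`.
inline std::string getExampleTLIMangledName() {
  // Yields "_ZGV_LLVM_N4v_sinf(vsinf)": ISA token "_LLVM_", unmasked ("N"),
  // 4 lanes, one "v" per argument, then the scalar and vector names.
  return mangleTLIVectorName("vsinf", "sinf", /*numArgs=*/1,
                             ElementCount::getFixed(4));
}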

/// Retrieve the `VFParamKind` from a string token.
VFParamKind getVFParamKindFromString(const StringRef Token);

// Name of the attribute where the variant mappings are stored.
static constexpr char const *MappingsAttrName = "vector-function-abi-variant";

/// Populates a set of strings representing the Vector Function ABI variants
/// associated to the CallInst CI. If the CI does not contain the
/// vector-function-abi-variant attribute, we return without populating
/// VariantMappings, i.e. callers of getVectorVariantNames need not check for
/// the presence of the attribute (see InjectTLIMappings).
void getVectorVariantNames(const CallInst &CI,
                           SmallVectorImpl<std::string> &VariantMappings);
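
// Illustrative sketch, for exposition only: for a call site annotated with
//   "vector-function-abi-variant"="_ZGVnN2v_foo,_ZGVnN4v_foo"
// the comma-separated mangled names are appended to the output vector.
// `CI` is an assumed input; the helper name is hypothetical.
inline SmallVector<std::string, 8> getExampleVariantNames(const CallInst &CI) {
  SmallVector<std::string, 8> VariantMappings;
  getVectorVariantNames(CI, VariantMappings);
  return VariantMappings;
}
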
} // end namespace VFABI

/// The Vector Function Database.
///
/// Helper class used to find the vector functions associated to a
/// scalar CallInst.
class VFDatabase {
  /// The Module of the CallInst CI.
  const Module *M;
  /// The CallInst instance being queried for scalar to vector mappings.
  const CallInst &CI;
  /// List of vector function descriptors associated to the call
  /// instruction.
  const SmallVector<VFInfo, 8> ScalarToVectorMappings;

  /// Retrieve the scalar-to-vector mappings associated to the rules of
  /// a Vector Function ABI.
  static void getVFABIMappings(const CallInst &CI,
                               SmallVectorImpl<VFInfo> &Mappings) {
    if (!CI.getCalledFunction())
      return;

    const StringRef ScalarName = CI.getCalledFunction()->getName();

    SmallVector<std::string, 8> ListOfStrings;
    // The check for the vector-function-abi-variant attribute is done when
    // retrieving the vector variant names here.
    VFABI::getVectorVariantNames(CI, ListOfStrings);
    if (ListOfStrings.empty())
      return;
    for (const auto &MangledName : ListOfStrings) {
      const std::optional<VFInfo> Shape =
          VFABI::tryDemangleForVFABI(MangledName, *(CI.getModule()));
      // A match is found via scalar and vector names, and also by
      // ensuring that the variant described in the attribute has a
      // corresponding definition or declaration of the vector
      // function in the Module M.
      if (Shape && (Shape->ScalarName == ScalarName)) {
        assert(CI.getModule()->getFunction(Shape->VectorName) &&
               "Vector function is missing.");
        Mappings.push_back(*Shape);
      }
    }
  }

public:
  /// Retrieve all the VFInfo instances associated to the CallInst CI.
  static SmallVector<VFInfo, 8> getMappings(const CallInst &CI) {
    SmallVector<VFInfo, 8> Ret;

    // Get mappings from the Vector Function ABI variants.
    getVFABIMappings(CI, Ret);

    // Other non-VFABI variants should be retrieved here.

    return Ret;
  }

  /// Constructor, requires a CallInst instance.
  VFDatabase(CallInst &CI)
      : M(CI.getModule()), CI(CI),
        ScalarToVectorMappings(VFDatabase::getMappings(CI)) {}
  /// \defgroup VFDatabase query interface.
  ///
  /// @{
  /// Retrieve the Function with VFShape \p Shape.
  Function *getVectorizedFunction(const VFShape &Shape) const {
    if (Shape == VFShape::getScalarShape(CI))
      return CI.getCalledFunction();

    for (const auto &Info : ScalarToVectorMappings)
      if (Info.Shape == Shape)
        return M->getFunction(Info.VectorName);

    return nullptr;
  }
  /// @}
};
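
// Illustrative sketch, for exposition only: looking up a 2-lane, unmasked
// variant for a call site. `CI` is an assumed call with VFABI variant
// mappings; the helper name is hypothetical.
inline Function *findTwoLaneVariant(CallInst &CI) {
  VFDatabase DB(CI);
  VFShape Shape = VFShape::get(CI, ElementCount::getFixed(2),
                               /*HasGlobalPred=*/false);
  // Returns the scalar callee for the scalar shape, a mapped vector Function
  // for a matching shape, or nullptr if no such variant exists.
  return DB.getVectorizedFunction(Shape);
}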

template <typename T> class ArrayRef;
class DemandedBits;
class GetElementPtrInst;
template <typename InstTy> class InterleaveGroup;
class IRBuilderBase;
class Loop;
class ScalarEvolution;
class TargetTransformInfo;
class Type;
class Value;

namespace Intrinsic {
typedef unsigned ID;
}

/// A helper function for converting Scalar types to vector types. If
/// the incoming type is void, we return void. If the EC represents a
/// scalar, we return the scalar type.
inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
  if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
    return Scalar;
  return VectorType::get(Scalar, EC);
}

inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
}
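
// Illustrative sketch, for exposition only: the common conversions. `Ctx` is
// an assumed LLVMContext; the helper name is hypothetical.
inline void toVectorTyExamples(LLVMContext &Ctx) {
  Type *V4I32 = ToVectorTy(Type::getInt32Ty(Ctx), 4); // <4 x i32>
  Type *NxV2F32 =
      ToVectorTy(Type::getFloatTy(Ctx),
                 ElementCount::getScalable(2)); // <vscale x 2 x float>
  Type *Void = ToVectorTy(Type::getVoidTy(Ctx), 4); // void, returned unchanged
  (void)V4I32;
  (void)NxV2F32;
  (void)Void;
}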

/// Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all scalars
/// for the scalar form of the intrinsic and all vectors (or scalars handled by
/// isVectorIntrinsicWithScalarOpAtArg) for the vector form of the intrinsic.
bool isTriviallyVectorizable(Intrinsic::ID ID);

/// Identifies if the vector form of the intrinsic has a scalar operand.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                        unsigned ScalarOpdIdx);

/// Identifies if the vector form of the intrinsic has an operand that has
/// an overloaded type.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, unsigned OpdIdx);

/// Returns intrinsic ID for call.
/// For the input call instruction it finds the mapping intrinsic and returns
/// its intrinsic ID; if it does not find one, it returns not_intrinsic.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
                                          const TargetLibraryInfo *TLI);

/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
unsigned getGEPInductionOperand(const GetElementPtrInst *Gep);

/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp);

/// If a value has only one user that is a CastInst, return it.
Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty);

/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp);

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
Value *findScalarElement(Value *V, unsigned EltNo);

/// If all non-negative \p Mask elements are the same value, return that value.
/// If all elements are negative (undefined) or \p Mask contains different
/// non-negative values, return -1.
int getSplatIndex(ArrayRef<int> Mask);

/// Get splat value if the input is a splat vector or return nullptr.
/// The value may be extracted from a splat constants vector or from
/// a sequence of instructions that broadcast a single value into a vector.
Value *getSplatValue(const Value *V);

/// Return true if each element of the vector value \p V is poisoned or equal to
/// every other non-poisoned element. If an index element is specified, either
/// every element of the vector is poisoned or the element at that index is not
/// poisoned and equal to every other non-poisoned element.
/// This may be more powerful than the related getSplatValue() because it is
/// not limited by finding a scalar source value to a splatted vector.
bool isSplatValue(const Value *V, int Index = -1, unsigned Depth = 0);
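
// Illustrative note: both of the above recognize a canonical IR splat such as
//
//   %ins   = insertelement <4 x float> poison, float %x, i64 0
//   %splat = shufflevector <4 x float> %ins, <4 x float> poison,
//                          <4 x i32> zeroinitializer
//
// getSplatValue(%splat) returns %x, while isSplatValue also covers cases
// (e.g. splat-like shuffles of unknown vectors) where no single scalar source
// value can be produced.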

/// Transform a shuffle mask's output demanded element mask into demanded
/// element masks for the 2 operands, returns false if the mask isn't valid.
/// Both \p DemandedLHS and \p DemandedRHS are initialised to [SrcWidth].
/// \p AllowUndefElts permits "-1" indices to be treated as undef.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef<int> Mask,
                            const APInt &DemandedElts, APInt &DemandedLHS,
                            APInt &DemandedRHS, bool AllowUndefElts = false);
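
// Illustrative sketch, for exposition only: splitting the demand for a
// two-operand shuffle of <4 x i32> vectors; the helper name is hypothetical.
inline bool shuffleDemandedEltsExample() {
  APInt DemandedLHS, DemandedRHS;
  // Mask <0, 5, 2, 7> reads lanes 0 and 2 of the first operand and lanes 1
  // and 3 of the second operand.
  bool Ok = getShuffleDemandedElts(/*SrcWidth=*/4, {0, 5, 2, 7},
                                   APInt::getAllOnes(4), DemandedLHS,
                                   DemandedRHS);
  // Ok is true, DemandedLHS is 0b0101 and DemandedRHS is 0b1010.
  return Ok;
}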

/// Replace each shuffle mask index with the scaled sequential indices for an
/// equivalent mask of narrowed elements. Mask elements that are less than 0
/// (sentinel values) are repeated in the output mask.
///
/// Example with Scale = 4:
///   <4 x i32> <3, 2, 0, -1> -->
///   <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1>
///
/// This is the reverse process of widening shuffle mask elements, but it always
/// succeeds because the indexes can always be multiplied (scaled up) to map to
/// narrower vector elements.
void narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                           SmallVectorImpl<int> &ScaledMask);

/// Try to transform a shuffle mask by replacing elements with the scaled index
/// for an equivalent mask of widened elements. If all mask elements that would
/// map to a wider element of the new mask are the same negative number
/// (sentinel value), that element of the new mask is the same value. If any
/// element in a given slice is negative and some other element in that slice is
/// not the same value, return false (partial matches with sentinel values are
/// not allowed).
///
/// Example with Scale = 4:
///   <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1> -->
///   <4 x i32> <3, 2, 0, -1>
///
/// This is the reverse process of narrowing shuffle mask elements if it
/// succeeds. This transform is not always possible because indexes may not
/// divide evenly (scale down) to map to wider vector elements.
bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                          SmallVectorImpl<int> &ScaledMask);
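
// Illustrative sketch, for exposition only: round-tripping a mask between the
// two element widths; the helper name is hypothetical.
inline bool shuffleMaskScalingExample() {
  SmallVector<int, 16> Narrow;
  narrowShuffleMaskElts(/*Scale=*/4, {3, 2, 0, -1}, Narrow);
  // Narrow is <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1>.
  SmallVector<int, 4> Wide;
  bool Ok = widenShuffleMaskElts(/*Scale=*/4, Narrow, Wide);
  // Ok is true and Wide is back to <3, 2, 0, -1>.
  return Ok;
}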

/// Repetitively apply `widenShuffleMaskElts()` for as long as it succeeds,
/// to get the shuffle mask with widest possible elements.
void getShuffleMaskWithWidestElts(ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &ScaledMask);

/// Splits and processes shuffle mask depending on the number of input and
/// output registers. The function does 2 main things: 1) splits the
/// source/destination vectors into real registers; 2) does the mask analysis to
/// identify which real registers are permuted. Then the function processes the
/// resulting registers mask using provided action items. If no input register
/// is defined, \p NoInputAction action is used. If only 1 input register is
/// used, \p SingleInputAction is used, otherwise \p ManyInputsAction is used to
/// process > 2 input registers and masks.
/// \param Mask Original shuffle mask.
/// \param NumOfSrcRegs Number of source registers.
/// \param NumOfDestRegs Number of destination registers.
/// \param NumOfUsedRegs Number of actually used destination registers.
void processShuffleMasks(
    ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
    unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);

/// Compute a map of integer instructions to their minimum legal type
/// size.
///
/// C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int
/// type (e.g. i32) whenever arithmetic is performed on them.
///
/// For targets with native i8 or i16 operations, usually InstCombine can shrink
/// the arithmetic type down again. However InstCombine refuses to create
/// illegal types, so for targets without i8 or i16 registers, the lengthening
/// and shrinking remains.
///
/// Most SIMD ISAs (e.g. NEON) however support vectors of i8 or i16 even when
/// their scalar equivalents do not, so during vectorization it is important to
/// remove these lengthens and truncates when deciding the profitability of
/// vectorization.
///
/// This function analyzes the given range of instructions and determines the
/// minimum type size each can be converted to. It attempts to remove or
/// minimize type size changes across each def-use chain, so for example in the
/// following code:
///
///   %1 = load i8, i8*
///   %2 = add i8 %1, 2
///   %3 = load i16, i16*
///   %4 = zext i8 %2 to i32
///   %5 = zext i16 %3 to i32
///   %6 = add i32 %4, %5
///   %7 = trunc i32 %6 to i16
///
/// Instruction %6 must be done at least in i16, so computeMinimumValueSizes
/// will return: {%1: 16, %2: 16, %3: 16, %4: 16, %5: 16, %6: 16, %7: 16}.
///
/// If the optional TargetTransformInfo is provided, this function tries harder
/// to do less work by only looking at illegal types.
MapVector<Instruction*, uint64_t>
computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
                         DemandedBits &DB,
                         const TargetTransformInfo *TTI=nullptr);

/// Compute the union of two access-group lists.
///
/// If the list contains just one access group, it is returned directly. If the
/// list is empty, returns nullptr.
MDNode *uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2);

/// Compute the access-group list of access groups that @p Inst1 and @p Inst2
/// are both in. If either instruction does not access memory at all, it is
/// considered to be in every list.
///
/// If the list contains just one access group, it is returned directly. If the
/// list is empty, returns nullptr.
MDNode *intersectAccessGroups(const Instruction *Inst1,
                              const Instruction *Inst2);

/// Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath,
/// MD_nontemporal, MD_access_group].
/// For K in Kinds, we get the MDNode for K from each of the
/// elements of VL, compute their "intersection" (i.e., the most generic
/// metadata value that covers all of the individual values), and set I's
/// metadata for M equal to the intersection value.
///
/// This function always sets a (possibly null) value for each K in Kinds.
Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);

/// Create a mask that filters the members of an interleave group where there
/// are gaps.
///
/// For example, the mask for \p Group with interleave-factor 3
/// and \p VF 4, that has only its first member present is:
///
///   <1,0,0,1,0,0,1,0,0,1,0,0>
///
/// Note: The result is a mask of 0's and 1's, as opposed to the other
/// create[*]Mask() utilities which create a shuffle mask (mask that
/// consists of indices).
Constant *createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
                               const InterleaveGroup<Instruction> &Group);

/// Create a mask with replicated elements.
///
/// This function creates a shuffle mask for replicating each of the \p VF
/// elements in a vector \p ReplicationFactor times. It can be used to
/// transform a mask of \p VF elements into a mask of
/// \p VF * \p ReplicationFactor elements used by a predicated
/// interleaved-group of loads/stores whose Interleaved-factor ==
/// \p ReplicationFactor.
///
/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
///
///   <0,0,0,1,1,1,2,2,2,3,3,3>
llvm::SmallVector<int, 16> createReplicatedMask(unsigned ReplicationFactor,
                                                unsigned VF);

/// Create an interleave shuffle mask.
///
/// This function creates a shuffle mask for interleaving \p NumVecs vectors of
/// vectorization factor \p VF into a single wide vector. The mask is of the
/// form:
///
///   <0, VF, VF * 2, ..., VF * (NumVecs - 1), 1, VF + 1, VF * 2 + 1, ...>
///
/// For example, the mask for VF = 4 and NumVecs = 2 is:
///
///   <0, 4, 1, 5, 2, 6, 3, 7>.
llvm::SmallVector<int, 16> createInterleaveMask(unsigned VF, unsigned NumVecs);

/// Create a stride shuffle mask.
///
/// This function creates a shuffle mask whose elements begin at \p Start and
/// are incremented by \p Stride. The mask can be used to deinterleave an
/// interleaved vector into separate vectors of vectorization factor \p VF. The
/// mask is of the form:
///
///   <Start, Start + Stride, ..., Start + Stride * (VF - 1)>
///
/// For example, the mask for Start = 0, Stride = 2, and VF = 4 is:
///
///   <0, 2, 4, 6>
llvm::SmallVector<int, 16> createStrideMask(unsigned Start, unsigned Stride,
                                            unsigned VF);
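
// Illustrative sketch, for exposition only: the masks involved in interleaving
// two 4-element vectors and pulling one of them back out; the helper name is
// hypothetical.
inline void interleaveMaskExamples() {
  // <0, 4, 1, 5, 2, 6, 3, 7>: a two-operand shufflevector of %A and %B with
  // this mask produces <A0, B0, A1, B1, A2, B2, A3, B3>.
  SmallVector<int, 16> Interleave =
      createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  // <1, 3, 5, 7>: applied to the wide vector above, this deinterleaves the
  // %B lanes again.
  SmallVector<int, 16> Odd =
      createStrideMask(/*Start=*/1, /*Stride=*/2, /*VF=*/4);
  (void)Interleave;
  (void)Odd;
}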

/// Create a sequential shuffle mask.
///
/// This function creates a shuffle mask whose elements are sequential and begin
/// at \p Start.  The mask contains \p NumInts integers and is padded with \p
/// NumUndefs undef values. The mask is of the form:
///
///   <Start, Start + 1, ... Start + NumInts - 1, undef_1, ... undef_NumUndefs>
///
/// For example, the mask for Start = 0, NumInts = 4, and NumUndefs = 4 is:
///
///   <0, 1, 2, 3, undef, undef, undef, undef>
llvm::SmallVector<int, 16>
createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs);

/// Given a shuffle mask for a binary shuffle, create the equivalent shuffle
/// mask assuming both operands are identical. This assumes that the unary
/// shuffle will use elements from operand 0 (operand 1 will be unused).
llvm::SmallVector<int, 16> createUnaryMask(ArrayRef<int> Mask,
                                           unsigned NumElts);

/// Concatenate a list of vectors.
///
/// This function generates code that concatenates the vectors in \p Vecs into a
/// single large vector. The number of vectors should be greater than one, and
/// their element types should be the same. The number of elements in the
/// vectors should also be the same; however, if the last vector has fewer
/// elements, it will be padded with undefs.
Value *concatenateVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vecs);

/// Given a mask vector of i1, return true if all of the elements of this
/// predicate mask are known to be false or undef.  That is, return true if all
/// lanes can be assumed inactive.
bool maskIsAllZeroOrUndef(Value *Mask);

/// Given a mask vector of i1, return true if all of the elements of this
/// predicate mask are known to be true or undef.  That is, return true if all
/// lanes can be assumed active.
bool maskIsAllOneOrUndef(Value *Mask);

/// Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y)
/// for each lane which may be active.
APInt possiblyDemandedEltsInMask(Value *Mask);

/// The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
/// index should be less than the interleave factor, which is equal to the
/// absolute value of the access's stride.
///
/// E.g. An interleaved load group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          a = A[i];                           // Member of index 0
///          b = A[i+1];                         // Member of index 1
///          d = A[i+3];                         // Member of index 3
///          ...
///        }
///
///      An interleaved store group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          ...
///          A[i]   = a;                         // Member of index 0
///          A[i+1] = b;                         // Member of index 1
///          A[i+2] = c;                         // Member of index 2
///          A[i+3] = d;                         // Member of index 3
///        }
///
/// Note: the interleaved load group could have gaps (missing members), but
/// the interleaved store group doesn't allow gaps.
template <typename InstTy> class InterleaveGroup {
public:
  InterleaveGroup(uint32_t Factor, bool Reverse, Align Alignment)
      : Factor(Factor), Reverse(Reverse), Alignment(Alignment),
        InsertPos(nullptr) {}

  InterleaveGroup(InstTy *Instr, int32_t Stride, Align Alignment)
      : Alignment(Alignment), InsertPos(Instr) {
    Factor = std::abs(Stride);
    assert(Factor > 1 && "Invalid interleave factor");

    Reverse = Stride < 0;
    Members[0] = Instr;
  }

  bool isReverse() const { return Reverse; }
  uint32_t getFactor() const { return Factor; }
  Align getAlign() const { return Alignment; }
  uint32_t getNumMembers() const { return Members.size(); }

  /// Try to insert a new member \p Instr with index \p Index and
  /// alignment \p NewAlign. The index is related to the leader and it could be
  /// negative if it is the new leader.
  ///
  /// \returns false if the instruction doesn't belong to the group.
  bool insertMember(InstTy *Instr, int32_t Index, Align NewAlign) {
    // Make sure the key fits in an int32_t.
    std::optional<int32_t> MaybeKey = checkedAdd(Index, SmallestKey);
    if (!MaybeKey)
      return false;
    int32_t Key = *MaybeKey;

    // Skip if the key is used for either the tombstone or empty special values.
    if (DenseMapInfo<int32_t>::getTombstoneKey() == Key ||
        DenseMapInfo<int32_t>::getEmptyKey() == Key)
      return false;

    // Skip if there is already a member with the same index.
    if (Members.find(Key) != Members.end())
      return false;

    if (Key > LargestKey) {
      // The largest index is always less than the interleave factor.
      if (Index >= static_cast<int32_t>(Factor))
        return false;

      LargestKey = Key;
    } else if (Key < SmallestKey) {

      // Make sure the largest index fits in an int32_t.
      std::optional<int32_t> MaybeLargestIndex = checkedSub(LargestKey, Key);
      if (!MaybeLargestIndex)
        return false;

      // The largest index is always less than the interleave factor.
      if (*MaybeLargestIndex >= static_cast<int64_t>(Factor))
        return false;

      SmallestKey = Key;
    }

    // It's always safe to select the minimum alignment.
    Alignment = std::min(Alignment, NewAlign);
    Members[Key] = Instr;
    return true;
  }

  /// Get the member with the given index \p Index
  ///
  /// \returns nullptr if it contains no such member.
  InstTy *getMember(uint32_t Index) const {
    int32_t Key = SmallestKey + Index;
    return Members.lookup(Key);
  }

  /// Get the index for the given member. Unlike the key in the member
  /// map, the index starts from 0.
  uint32_t getIndex(const InstTy *Instr) const {
    for (auto I : Members) {
      if (I.second == Instr)
        return I.first - SmallestKey;
    }

    llvm_unreachable("InterleaveGroup contains no such member");
  }

  InstTy *getInsertPos() const { return InsertPos; }
  void setInsertPos(InstTy *Inst) { InsertPos = Inst; }

  /// Add metadata (e.g. alias info) from the instructions in this group to \p
  /// NewInst.
  ///
  /// FIXME: this function currently does not add noalias metadata a la
  /// addNewMetadata.  To do that we need to compute the intersection of the
  /// noalias info from all members.
  void addMetadata(InstTy *NewInst) const;

  /// Returns true if this Group requires a scalar iteration to handle gaps.
  bool requiresScalarEpilogue() const {
    // If the last member of the Group exists, then a scalar epilog is not
    // needed for this group.
    if (getMember(getFactor() - 1))
      return false;

    // We have a group with gaps. It therefore can't be a reversed access,
    // because such groups get invalidated (TODO).
    assert(!isReverse() && "Group should have been invalidated");

    // This is a group of loads, with gaps, and without a last member.
    return true;
  }

private:
  uint32_t Factor; // Interleave Factor.
  bool Reverse;
  Align Alignment;
  DenseMap<int32_t, InstTy *> Members;
  int32_t SmallestKey = 0;
  int32_t LargestKey = 0;

  // To avoid breaking dependences, vectorized instructions of an interleave
  // group should be inserted at either the first load or the last store in
  // program order.
  //
  // E.g. %even = load i32             // Insert Position
  //      %add = add i32 %even         // Use of %even
  //      %odd = load i32
  //
  //      store i32 %even
  //      %odd = add i32               // Def of %odd
  //      store i32 %odd               // Insert Position
  InstTy *InsertPos;
};
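
// Illustrative sketch, for exposition only: building a factor-2 store group,
// assuming `A0` and `A1` are the stores of A[i] and A[i+1] from a stride-2
// loop; the helper name is hypothetical.
inline bool buildStoreGroupExample(Instruction *A0, Instruction *A1) {
  InterleaveGroup<Instruction> Group(A0, /*Stride=*/2, Align(4));
  // A1 sits one element after the leader, so it is inserted at index 1;
  // insertMember refuses indices at or beyond the factor.
  bool Inserted = Group.insertMember(A1, /*Index=*/1, Align(4));
  return Inserted && Group.getMember(1) == A1;
}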

/// Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise it's meaningless to do analysis as the vectorization
/// on interleaved accesses is unsafe.
///
/// The analysis collects interleave groups and records the relationships
/// between the member and the group in a map.
class InterleavedAccessInfo {
public:
  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                        DominatorTree *DT, LoopInfo *LI,
                        const LoopAccessInfo *LAI)
      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}

  ~InterleavedAccessInfo() { invalidateGroups(); }

  /// Analyze the interleaved accesses and collect them in interleave
  /// groups. Substitute symbolic strides using \p Strides.
  /// Consider also predicated loads/stores in the analysis if
  /// \p EnableMaskedInterleavedGroup is true.
  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);

  /// Invalidate groups, e.g., in case all blocks in loop will be predicated
  /// contrary to original assumption. Although we currently prevent group
  /// formation for predicated accesses, we may be able to relax this limitation
  /// in the future once we handle more complicated blocks. Returns true if any
  /// groups were invalidated.
  bool invalidateGroups() {
    if (InterleaveGroups.empty()) {
      assert(
          !RequiresScalarEpilogue &&
          "RequiresScalarEpilog should not be set without interleave groups");
      return false;
    }

    InterleaveGroupMap.clear();
    for (auto *Ptr : InterleaveGroups)
      delete Ptr;
    InterleaveGroups.clear();
    RequiresScalarEpilogue = false;
    return true;
  }

  /// Check if \p Instr belongs to any interleave group.
  bool isInterleaved(Instruction *Instr) const {
    return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end();
  }

  /// Get the interleave group that \p Instr belongs to.
  ///
  /// \returns nullptr if it doesn't have such a group.
  InterleaveGroup<Instruction> *
  getInterleaveGroup(const Instruction *Instr) const {
    return InterleaveGroupMap.lookup(Instr);
  }

  iterator_range<SmallPtrSetIterator<llvm::InterleaveGroup<Instruction> *>>
  getInterleaveGroups() {
    return make_range(InterleaveGroups.begin(), InterleaveGroups.end());
  }

  /// Returns true if an interleaved group that may access memory
  /// out-of-bounds requires a scalar epilogue iteration for correctness.
  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

  /// Invalidate groups that require a scalar epilogue (due to gaps). This can
  /// happen when optimizing for size forbids a scalar epilogue, and the gap
  /// cannot be filtered by masking the load/store.
  void invalidateGroupsRequiringScalarEpilogue();

  /// Returns true if we have any interleave groups.
  bool hasGroups() const { return !InterleaveGroups.empty(); }

private:
  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
  /// The interleaved access analysis can also add new predicates (for example
  /// by versioning strides of pointers).
  PredicatedScalarEvolution &PSE;

  Loop *TheLoop;
  DominatorTree *DT;
  LoopInfo *LI;
  const LoopAccessInfo *LAI;

  /// True if the loop may contain non-reversed interleaved groups with
  /// out-of-bounds accesses. We ensure we don't speculatively access memory
  /// out-of-bounds by executing at least one scalar epilogue iteration.
  bool RequiresScalarEpilogue = false;

  /// Holds the relationships between the members and the interleave group.
  DenseMap<Instruction *, InterleaveGroup<Instruction> *> InterleaveGroupMap;

  SmallPtrSet<InterleaveGroup<Instruction> *, 4> InterleaveGroups;

  /// Holds dependences among the memory accesses in the loop. It maps a source
  /// access to a set of dependent sink accesses.
  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;

  /// The descriptor for a strided memory access.
  struct StrideDescriptor {
    StrideDescriptor() = default;
    StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
                     Align Alignment)
        : Stride(Stride), Scev(Scev), Size(Size), Alignment(Alignment) {}

    // The access's stride. It is negative for a reverse access.
    int64_t Stride = 0;

    // The scalar expression of this access.
    const SCEV *Scev = nullptr;

    // The size of the memory object.
    uint64_t Size = 0;

    // The alignment of this access.
    Align Alignment;
  };

  /// A type for holding instructions and their stride descriptors.
  using StrideEntry = std::pair<Instruction *, StrideDescriptor>;

  /// Create a new interleave group with the given instruction \p Instr,
  /// stride \p Stride and alignment \p Align.
  ///
  /// \returns the newly created interleave group.
  InterleaveGroup<Instruction> *
  createInterleaveGroup(Instruction *Instr, int Stride, Align Alignment) {
    assert(!InterleaveGroupMap.count(Instr) &&
           "Already in an interleaved access group");
    InterleaveGroupMap[Instr] =
        new InterleaveGroup<Instruction>(Instr, Stride, Alignment);
    InterleaveGroups.insert(InterleaveGroupMap[Instr]);
    return InterleaveGroupMap[Instr];
  }

  /// Release the group and remove all the relationships.
  void releaseGroup(InterleaveGroup<Instruction> *Group) {
    for (unsigned i = 0; i < Group->getFactor(); i++)
      if (Instruction *Member = Group->getMember(i))
        InterleaveGroupMap.erase(Member);

    InterleaveGroups.erase(Group);
    delete Group;
  }

  /// Collect all the accesses with a constant stride in program order.
  void collectConstStrideAccesses(
      MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
      const ValueToValueMap &Strides);

  /// Returns true if \p Stride is allowed in an interleaved group.
  static bool isStrided(int Stride);

  /// Returns true if \p BB is a predicated block.
  bool isPredicated(BasicBlock *BB) const {
    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
  }

  /// Returns true if LoopAccessInfo can be used for dependence queries.
  bool areDependencesValid() const {
    return LAI && LAI->getDepChecker().getDependences();
  }

  /// Returns true if memory accesses \p A and \p B can be reordered, if
  /// necessary, when constructing interleaved groups.
  ///
  /// \p A must precede \p B in program order. We return false if reordering is
  /// not necessary or is prevented because \p A and \p B may be dependent.
  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
                                                 StrideEntry *B) const {
    // Code motion for interleaved accesses can potentially hoist strided loads
    // and sink strided stores. The code below checks the legality of the
    // following two conditions:
    //
    // 1. Potentially moving a strided load (B) before any store (A) that
    //    precedes B, or
    //
    // 2. Potentially moving a strided store (A) after any load or store (B)
    //    that A precedes.
    //
    // It's legal to reorder A and B if we know there isn't a dependence from A
    // to B. Note that this determination is conservative since some
    // dependences could potentially be reordered safely.

    // A is potentially the source of a dependence.
    auto *Src = A->first;
    auto SrcDes = A->second;

    // B is potentially the sink of a dependence.
    auto *Sink = B->first;
    auto SinkDes = B->second;

    // Code motion for interleaved accesses can't violate WAR dependences.
    // Thus, reordering is legal if the source isn't a write.
    if (!Src->mayWriteToMemory())
      return true;

    // At least one of the accesses must be strided.
    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
      return true;

    // If dependence information is not available from LoopAccessInfo,
    // conservatively assume the instructions can't be reordered.
    if (!areDependencesValid())
      return false;

    // If we know there is a dependence from source to sink, assume the
    // instructions can't be reordered. Otherwise, reordering is legal.
    return Dependences.find(Src) == Dependences.end() ||
           !Dependences.lookup(Src).count(Sink);
  }

  /// Collect the dependences from LoopAccessInfo.
  ///
  /// We process the dependences once during the interleaved access analysis to
  /// enable constant-time dependence queries.
  void collectDependences() {
    if (!areDependencesValid())
      return;
    auto *Deps = LAI->getDepChecker().getDependences();
    for (auto Dep : *Deps)
      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
  }
};

} // llvm namespace

#endif