//===- llvm/Analysis/VectorUtils.h - Vector utilities -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines some vectorizer utilities.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_VECTORUTILS_H
#define LLVM_ANALYSIS_VECTORUTILS_H

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Support/CheckedArithmetic.h"

namespace llvm {
class TargetLibraryInfo;

/// Describes the type of parameters.
enum class VFParamKind {
  Vector,            // No semantic information.
  OMP_Linear,        // declare simd linear(i)
  OMP_LinearRef,     // declare simd linear(ref(i))
  OMP_LinearVal,     // declare simd linear(val(i))
  OMP_LinearUVal,    // declare simd linear(uval(i))
  OMP_LinearPos,     // declare simd linear(i:c) uniform(c)
  OMP_LinearValPos,  // declare simd linear(val(i:c)) uniform(c)
  OMP_LinearRefPos,  // declare simd linear(ref(i:c)) uniform(c)
  OMP_LinearUValPos, // declare simd linear(uval(i:c)) uniform(c)
  OMP_Uniform,       // declare simd uniform(i)
  GlobalPredicate,   // Global logical predicate that acts on all lanes
                     // of the input and output mask concurrently. For
                     // example, it is implied by the `M` token in the
                     // Vector Function ABI mangled name.
  Unknown
};

/// Describes the type of Instruction Set Architecture.
enum class VFISAKind {
  AdvancedSIMD, // AArch64 Advanced SIMD (NEON)
  SVE,          // AArch64 Scalable Vector Extension
  SSE,          // x86 SSE
  AVX,          // x86 AVX
  AVX2,         // x86 AVX2
  AVX512,       // x86 AVX512
  LLVM,         // LLVM internal ISA for functions that are not
                // attached to an existing ABI via name mangling.
  Unknown       // Unknown ISA
};

/// Encapsulates information needed to describe a parameter.
///
/// The description of the parameter is not linked directly to
/// OpenMP or any other vector function description. This structure
/// is extendible to handle other paradigms that describe vector
/// functions and their parameters.
struct VFParameter {
  unsigned ParamPos;         // Parameter Position in Scalar Function.
  VFParamKind ParamKind;     // Kind of Parameter.
  int LinearStepOrPos = 0;   // Step or Position of the Parameter.
  Align Alignment = Align(); // Optional alignment in bytes, defaulted to 1.

  // Comparison operator.
  bool operator==(const VFParameter &Other) const {
    return std::tie(ParamPos, ParamKind, LinearStepOrPos, Alignment) ==
           std::tie(Other.ParamPos, Other.ParamKind, Other.LinearStepOrPos,
                    Other.Alignment);
  }
};

/// Contains the information about the kind of vectorization
/// available.
///
/// This object is independent of the paradigm used to
/// represent vector functions. In particular, it is not attached to
/// any target-specific ABI.
struct VFShape {
  ElementCount VF;                        // Vectorization factor.
  SmallVector<VFParameter, 8> Parameters; // List of parameter information.
  // Comparison operator.
  bool operator==(const VFShape &Other) const {
    return std::tie(VF, Parameters) == std::tie(Other.VF, Other.Parameters);
  }

  /// Update the parameter in position P.ParamPos to P.
  void updateParam(VFParameter P) {
    assert(P.ParamPos < Parameters.size() && "Invalid parameter position.");
    Parameters[P.ParamPos] = P;
    assert(hasValidParameterList() && "Invalid parameter list");
  }

  // Retrieve the VFShape that can be used to map a (scalar) function to itself,
  // with VF = 1.
  static VFShape getScalarShape(const CallInst &CI) {
    return VFShape::get(CI, ElementCount::getFixed(1),
                        /*HasGlobalPredicate*/ false);
  }

  // Retrieve the basic vectorization shape of the function, where all
  // parameters are mapped to VFParamKind::Vector with \p EC
  // lanes. Specifies whether the function has a Global Predicate
  // argument via \p HasGlobalPred.
  static VFShape get(const CallInst &CI, ElementCount EC, bool HasGlobalPred) {
    SmallVector<VFParameter, 8> Parameters;
    for (unsigned I = 0; I < CI.arg_size(); ++I)
      Parameters.push_back(VFParameter({I, VFParamKind::Vector}));
    if (HasGlobalPred)
      Parameters.push_back(
          VFParameter({CI.arg_size(), VFParamKind::GlobalPredicate}));

    return {EC, Parameters};
  }
  /// Validation check on the Parameters in the VFShape.
  bool hasValidParameterList() const;
};
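
// Illustrative sketch: building and refining a VFShape for a hypothetical
// `CallInst *CI` with two arguments (the names here are placeholders, not
// part of any existing API beyond what this header declares).
//
//   VFShape Shape = VFShape::get(*CI, ElementCount::getFixed(4),
//                                /*HasGlobalPred=*/true);
//   // Shape.Parameters now describes {0, Vector}, {1, Vector},
//   // {2, GlobalPredicate}. Mark the first argument as uniform instead:
//   Shape.updateParam({/*ParamPos=*/0, VFParamKind::OMP_Uniform});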

/// Holds the VFShape for a specific scalar to vector function mapping.
struct VFInfo {
  VFShape Shape;          /// Classification of the vector function.
  std::string ScalarName; /// Scalar Function Name.
  std::string VectorName; /// Vector Function Name associated to this VFInfo.
  VFISAKind ISA;          /// Instruction Set Architecture.
};

namespace VFABI {
/// LLVM Internal VFABI ISA token for vector functions.
static constexpr char const *_LLVM_ = "_LLVM_";
/// Prefix for internal name redirection for a vector function that
/// tells the compiler to scalarize the call using the scalar name
/// of the function. For example, a mangled name like
/// `_ZGV_LLVM_N2v_foo(_LLVM_Scalarize_foo)` would tell the
/// vectorizer to vectorize the scalar call `foo`, and to scalarize
/// it once vectorization is done.
static constexpr char const *_LLVM_Scalarize_ = "_LLVM_Scalarize_";

/// Function to construct a VFInfo out of a mangled name in the
/// following format:
///
/// <VFABI_name>{(<redirection>)}
///
/// where <VFABI_name> is the name of the vector function, mangled according
/// to the rules described in the Vector Function ABI of the target vector
/// extension (or <isa> from now on). The <VFABI_name> is in the following
/// format:
///
/// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)]
///
/// This method supports demangling rules for the following <isa>:
///
/// * AArch64: https://developer.arm.com/docs/101129/latest
///
/// * x86 (libmvec): https://sourceware.org/glibc/wiki/libmvec and
///  https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt
///
/// \param MangledName -> input string in the format
/// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)].
/// \param M -> Module used to retrieve information about the vector
/// function that cannot be retrieved from the mangled
/// name. At the moment, this parameter is needed only to retrieve the
/// Vectorization Factor of scalable vector functions from their
/// respective IR declarations.
std::optional<VFInfo> tryDemangleForVFABI(StringRef MangledName,
                                          const Module &M);
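
// Illustrative sketch: demangling a Vector Function ABI name. The string
// below follows the `_ZGV<isa><mask><vlen><parameters>_<scalarname>
// [(<redirection>)]` format described above; `M` is a hypothetical Module
// that declares the redirected vector function.
//
//   std::optional<VFInfo> Info =
//       VFABI::tryDemangleForVFABI("_ZGV_LLVM_N2v_foo(vector_foo)", M);
//   if (Info) {
//     // Expected (sketch): Info->ScalarName == "foo",
//     // Info->VectorName == "vector_foo",
//     // Info->Shape.VF == ElementCount::getFixed(2).
//   }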

/// This routine mangles the given VectorName according to the LangRef
/// specification for the vector-function-abi-variant attribute and is specific
/// to the TLI mappings. It is the responsibility of the caller to make sure
/// that this is only used if all parameters in the vector function are of
/// vector type. The returned string holds the scalar-to-vector mapping:
///    _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)
///
/// where:
///
/// <isa> = "_LLVM_"
/// <mask> = "N". Note: TLI does not support masked interfaces.
/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
///          field of the `VecDesc` struct. If the number of lanes is scalable
///          then 'x' is printed instead.
/// <vparams> = "v", one for each of the numArgs parameters.
/// <scalarname> = the name of the scalar function.
/// <vectorname> = the name of the vector function.
std::string mangleTLIVectorName(StringRef VectorName, StringRef ScalarName,
                                unsigned numArgs, ElementCount VF);

/// Retrieve the `VFParamKind` from a string token.
VFParamKind getVFParamKindFromString(const StringRef Token);

// Name of the attribute where the variant mappings are stored.
static constexpr char const *MappingsAttrName = "vector-function-abi-variant";

/// Populates a set of strings representing the Vector Function ABI variants
/// associated to the CallInst CI. If the CI does not contain the
/// vector-function-abi-variant attribute, we return without populating
/// VariantMappings, i.e. callers of getVectorVariantNames need not check for
/// the presence of the attribute (see InjectTLIMappings).
void getVectorVariantNames(const CallInst &CI,
                           SmallVectorImpl<std::string> &VariantMappings);
} // end namespace VFABI

/// The Vector Function Database.
///
/// Helper class used to find the vector functions associated to a
/// scalar CallInst.
class VFDatabase {
  /// The Module of the CallInst CI.
  const Module *M;
  /// The CallInst instance being queried for scalar to vector mappings.
  const CallInst &CI;
  /// List of vector function descriptors associated to the call
  /// instruction.
  const SmallVector<VFInfo, 8> ScalarToVectorMappings;

  /// Retrieve the scalar-to-vector mappings associated to the rules of
  /// a Vector Function ABI.
  static void getVFABIMappings(const CallInst &CI,
                               SmallVectorImpl<VFInfo> &Mappings) {
    if (!CI.getCalledFunction())
      return;

    const StringRef ScalarName = CI.getCalledFunction()->getName();

    SmallVector<std::string, 8> ListOfStrings;
    // The check for the vector-function-abi-variant attribute is done when
    // retrieving the vector variant names here.
    VFABI::getVectorVariantNames(CI, ListOfStrings);
    if (ListOfStrings.empty())
      return;
    for (const auto &MangledName : ListOfStrings) {
      const std::optional<VFInfo> Shape =
          VFABI::tryDemangleForVFABI(MangledName, *(CI.getModule()));
      // A match is found via scalar and vector names, and also by
      // ensuring that the variant described in the attribute has a
      // corresponding definition or declaration of the vector
      // function in the Module M.
      if (Shape && (Shape->ScalarName == ScalarName)) {
        assert(CI.getModule()->getFunction(Shape->VectorName) &&
               "Vector function is missing.");
        Mappings.push_back(*Shape);
      }
    }
  }

public:
  /// Retrieve all the VFInfo instances associated to the CallInst CI.
  static SmallVector<VFInfo, 8> getMappings(const CallInst &CI) {
    SmallVector<VFInfo, 8> Ret;

    // Get mappings from the Vector Function ABI variants.
    getVFABIMappings(CI, Ret);

    // Other non-VFABI variants should be retrieved here.

    return Ret;
  }

  /// Constructor, requires a CallInst instance.
  VFDatabase(CallInst &CI)
      : M(CI.getModule()), CI(CI),
        ScalarToVectorMappings(VFDatabase::getMappings(CI)) {}
  /// \defgroup VFDatabase query interface.
  ///
  /// @{
  /// Retrieve the Function with VFShape \p Shape.
  Function *getVectorizedFunction(const VFShape &Shape) const {
    if (Shape == VFShape::getScalarShape(CI))
      return CI.getCalledFunction();

    for (const auto &Info : ScalarToVectorMappings)
      if (Info.Shape == Shape)
        return M->getFunction(Info.VectorName);

    return nullptr;
  }
  /// @}
};
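
// Illustrative usage sketch: querying the database for a hypothetical call
// `Call` that carries the "vector-function-abi-variant" attribute.
//
//   VFDatabase DB(Call);
//   VFShape Shape = VFShape::get(Call, ElementCount::getFixed(4),
//                                /*HasGlobalPred=*/false);
//   if (Function *VecF = DB.getVectorizedFunction(Shape)) {
//     // A 4-lane, unpredicated variant exists and can replace the scalar call.
//   }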

template <typename T> class ArrayRef;
class DemandedBits;
class GetElementPtrInst;
template <typename InstTy> class InterleaveGroup;
class IRBuilderBase;
class Loop;
class ScalarEvolution;
class TargetTransformInfo;
class Type;
class Value;

namespace Intrinsic {
typedef unsigned ID;
}

/// A helper function for converting scalar types to vector types. If the
/// incoming type is void or metadata, it is returned unchanged. If \p EC
/// represents a scalar, we return the scalar type.
inline Type *ToVectorTy(Type *Scalar, ElementCount EC) {
  if (Scalar->isVoidTy() || Scalar->isMetadataTy() || EC.isScalar())
    return Scalar;
  return VectorType::get(Scalar, EC);
}

inline Type *ToVectorTy(Type *Scalar, unsigned VF) {
  return ToVectorTy(Scalar, ElementCount::getFixed(VF));
}
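
// Illustrative sketch, assuming an LLVMContext `Ctx` is available:
//
//   Type *I32 = Type::getInt32Ty(Ctx);
//   Type *V4I32 = ToVectorTy(I32, ElementCount::getFixed(4)); // <4 x i32>
//   Type *Same  = ToVectorTy(I32, /*VF=*/1);                  // i32 (scalar)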

/// Identify if the intrinsic is trivially vectorizable.
/// This method returns true if the intrinsic's argument types are all scalars
/// for the scalar form of the intrinsic and all vectors (or scalars handled by
/// isVectorIntrinsicWithScalarOpAtArg) for the vector form of the intrinsic.
bool isTriviallyVectorizable(Intrinsic::ID ID);

/// Identifies if the vector form of the intrinsic has a scalar operand.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                        unsigned ScalarOpdIdx);

/// Identifies if the vector form of the intrinsic has an operand that has
/// an overloaded type.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, unsigned OpdIdx);

/// Returns the intrinsic ID for the call.
/// For the given call instruction it finds the mapping intrinsic and returns
/// its intrinsic ID; if it does not find one, it returns not_intrinsic.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
                                          const TargetLibraryInfo *TLI);

/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
unsigned getGEPInductionOperand(const GetElementPtrInst *Gep);

/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp);

/// If a value has only one user that is a CastInst, return it.
Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty);

/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
Value *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp);

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
Value *findScalarElement(Value *V, unsigned EltNo);

/// If all non-negative \p Mask elements are the same value, return that value.
/// If all elements are negative (undefined) or \p Mask contains different
/// non-negative values, return -1.
int getSplatIndex(ArrayRef<int> Mask);

/// Get the splat value if the input is a splat vector, or return nullptr.
/// The value may be extracted from a constant splat vector or from
/// a sequence of instructions that broadcast a single value into a vector.
Value *getSplatValue(const Value *V);

/// Return true if each element of the vector value \p V is poisoned or equal to
/// every other non-poisoned element. If an index element is specified, either
/// every element of the vector is poisoned or the element at that index is not
/// poisoned and equal to every other non-poisoned element.
/// This may be more powerful than the related getSplatValue() because it is
/// not limited by finding a scalar source value to a splatted vector.
bool isSplatValue(const Value *V, int Index = -1, unsigned Depth = 0);

/// Transform a shuffle mask's output demanded element mask into demanded
/// element masks for the 2 operands, returns false if the mask isn't valid.
/// Both \p DemandedLHS and \p DemandedRHS are initialised to [SrcWidth].
/// \p AllowUndefElts permits "-1" indices to be treated as undef.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef<int> Mask,
                            const APInt &DemandedElts, APInt &DemandedLHS,
                            APInt &DemandedRHS, bool AllowUndefElts = false);

/// Replace each shuffle mask index with the scaled sequential indices for an
/// equivalent mask of narrowed elements. Mask elements that are less than 0
/// (sentinel values) are repeated in the output mask.
///
/// Example with Scale = 4:
///   <4 x i32> <3, 2, 0, -1> -->
///   <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1>
///
/// This is the reverse process of widening shuffle mask elements, but it always
/// succeeds because the indexes can always be multiplied (scaled up) to map to
/// narrower vector elements.
void narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                           SmallVectorImpl<int> &ScaledMask);

/// Try to transform a shuffle mask by replacing elements with the scaled index
/// for an equivalent mask of widened elements. If all mask elements that would
/// map to a wider element of the new mask are the same negative number
/// (sentinel value), that element of the new mask is the same value. If any
/// element in a given slice is negative and some other element in that slice is
/// not the same value, return false (partial matches with sentinel values are
/// not allowed).
///
/// Example with Scale = 4:
///   <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1> -->
///   <4 x i32> <3, 2, 0, -1>
///
/// This is the reverse process of narrowing shuffle mask elements if it
/// succeeds. This transform is not always possible because indexes may not
/// divide evenly (scale down) to map to wider vector elements.
bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                          SmallVectorImpl<int> &ScaledMask);

/// Repetitively apply `widenShuffleMaskElts()` for as long as it succeeds,
/// to get the shuffle mask with the widest possible elements.
void getShuffleMaskWithWidestElts(ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &ScaledMask);

/// Splits and processes a shuffle mask depending on the number of input and
/// output registers. The function does 2 main things: 1) splits the
/// source/destination vectors into real registers; 2) does the mask analysis
/// to identify which real registers are permuted. Then the function processes
/// the resulting registers mask using the provided action items. If no input
/// register is defined, \p NoInputAction is used. If only 1 input register is
/// used, \p SingleInputAction is used, otherwise \p ManyInputsAction is used to
/// process > 2 input registers and masks.
/// \param Mask Original shuffle mask.
/// \param NumOfSrcRegs Number of source registers.
/// \param NumOfDestRegs Number of destination registers.
/// \param NumOfUsedRegs Number of actually used destination registers.
void processShuffleMasks(
    ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
    unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction);

/// Compute a map of integer instructions to their minimum legal type
/// size.
///
/// C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int
/// type (e.g. i32) whenever arithmetic is performed on them.
///
/// For targets with native i8 or i16 operations, usually InstCombine can shrink
/// the arithmetic type down again. However, InstCombine refuses to create
/// illegal types, so for targets without i8 or i16 registers, the lengthening
/// and shrinking remain.
///
/// Most SIMD ISAs (e.g. NEON) however support vectors of i8 or i16 even when
/// their scalar equivalents do not, so during vectorization it is important to
/// remove these lengthens and truncates when deciding the profitability of
/// vectorization.
///
/// This function analyzes the given range of instructions and determines the
/// minimum type size each can be converted to. It attempts to remove or
/// minimize type size changes across each def-use chain, so for example in the
/// following code:
///
///   %1 = load i8, i8*
///   %2 = add i8 %1, 2
///   %3 = load i16, i16*
///   %4 = zext i8 %2 to i32
///   %5 = zext i16 %3 to i32
///   %6 = add i32 %4, %5
///   %7 = trunc i32 %6 to i16
///
/// Instruction %6 must be done at least in i16, so computeMinimumValueSizes
/// will return: {%1: 16, %2: 16, %3: 16, %4: 16, %5: 16, %6: 16, %7: 16}.
///
/// If the optional TargetTransformInfo is provided, this function tries harder
/// to do less work by only looking at illegal types.
MapVector<Instruction*, uint64_t>
computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
                         DemandedBits &DB,
                         const TargetTransformInfo *TTI=nullptr);

/// Compute the union of two access-group lists.
///
/// If the list contains just one access group, it is returned directly. If the
/// list is empty, returns nullptr.
MDNode *uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2);

/// Compute the access-group list of access groups that @p Inst1 and @p Inst2
/// are both in. If either instruction does not access memory at all, it is
/// considered to be in every list.
///
/// If the list contains just one access group, it is returned directly. If the
/// list is empty, returns nullptr.
MDNode *intersectAccessGroups(const Instruction *Inst1,
                              const Instruction *Inst2);

/// Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath,
/// MD_nontemporal, MD_access_group].
/// For K in Kinds, we get the MDNode for K from each of the
/// elements of VL, compute their "intersection" (i.e., the most generic
/// metadata value that covers all of the individual values), and set I's
/// metadata for K equal to the intersection value.
///
/// This function always sets a (possibly null) value for each K in Kinds.
Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);

/// Create a mask that filters the members of an interleave group where there
/// are gaps.
///
/// For example, the mask for \p Group with interleave-factor 3
/// and \p VF 4, that has only its first member present is:
///
///   <1,0,0,1,0,0,1,0,0,1,0,0>
///
/// Note: The result is a mask of 0's and 1's, as opposed to the other
/// create[*]Mask() utilities which create a shuffle mask (mask that
/// consists of indices).
Constant *createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
                               const InterleaveGroup<Instruction> &Group);

/// Create a mask with replicated elements.
///
/// This function creates a shuffle mask for replicating each of the \p VF
/// elements in a vector \p ReplicationFactor times. It can be used to
/// transform a mask of \p VF elements into a mask of
/// \p VF * \p ReplicationFactor elements used by a predicated
/// interleaved-group of loads/stores whose Interleaved-factor ==
/// \p ReplicationFactor.
///
/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
///
///   <0,0,0,1,1,1,2,2,2,3,3,3>
llvm::SmallVector<int, 16> createReplicatedMask(unsigned ReplicationFactor,
                                                unsigned VF);

/// Create an interleave shuffle mask.
///
/// This function creates a shuffle mask for interleaving \p NumVecs vectors of
/// vectorization factor \p VF into a single wide vector. The mask is of the
/// form:
///
///   <0, VF, VF * 2, ..., VF * (NumVecs - 1), 1, VF + 1, VF * 2 + 1, ...>
///
/// For example, the mask for VF = 4 and NumVecs = 2 is:
///
///   <0, 4, 1, 5, 2, 6, 3, 7>.
llvm::SmallVector<int, 16> createInterleaveMask(unsigned VF, unsigned NumVecs);

/// Create a stride shuffle mask.
///
/// This function creates a shuffle mask whose elements begin at \p Start and
/// are incremented by \p Stride. The mask can be used to deinterleave an
/// interleaved vector into separate vectors of vectorization factor \p VF. The
/// mask is of the form:
///
///   <Start, Start + Stride, ..., Start + Stride * (VF - 1)>
///
/// For example, the mask for Start = 0, Stride = 2, and VF = 4 is:
///
///   <0, 2, 4, 6>
llvm::SmallVector<int, 16> createStrideMask(unsigned Start, unsigned Stride,
                                            unsigned VF);

/// Create a sequential shuffle mask.
///
/// This function creates a shuffle mask whose elements are sequential and begin
/// at \p Start.  The mask contains \p NumInts integers and is padded with \p
/// NumUndefs undef values. The mask is of the form:
///
///   <Start, Start + 1, ... Start + NumInts - 1, undef_1, ... undef_NumUndefs>
///
/// For example, the mask for Start = 0, NumInts = 4, and NumUndefs = 4 is:
///
///   <0, 1, 2, 3, undef, undef, undef, undef>
llvm::SmallVector<int, 16>
createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs);
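
// Illustrative sketch combining the mask helpers above: deinterleaving and
// re-interleaving an 8-element vector `Wide` (two interleaved 4-element
// sequences) with a hypothetical IRBuilder `B`.
//
//   Value *Even = B.CreateShuffleVector(Wide, createStrideMask(0, 2, 4));
//   Value *Odd  = B.CreateShuffleVector(Wide, createStrideMask(1, 2, 4));
//   // <0, 4, 1, 5, 2, 6, 3, 7> re-interleaves the two halves:
//   Value *Re   = B.CreateShuffleVector(Even, Odd, createInterleaveMask(4, 2));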

/// Given a shuffle mask for a binary shuffle, create the equivalent shuffle
/// mask assuming both operands are identical. This assumes that the unary
/// shuffle will use elements from operand 0 (operand 1 will be unused).
llvm::SmallVector<int, 16> createUnaryMask(ArrayRef<int> Mask,
                                           unsigned NumElts);

/// Concatenate a list of vectors.
///
/// This function generates code that concatenates the vectors in \p Vecs into a
/// single large vector. The number of vectors should be greater than one, and
/// their element types should be the same. The number of elements in the
/// vectors should also be the same; however, if the last vector has fewer
/// elements, it will be padded with undefs.
Value *concatenateVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vecs);

/// Given a mask vector of i1, return true if all of the elements of this
/// predicate mask are known to be false or undef.  That is, return true if all
/// lanes can be assumed inactive.
bool maskIsAllZeroOrUndef(Value *Mask);

/// Given a mask vector of i1, return true if all of the elements of this
/// predicate mask are known to be true or undef.  That is, return true if all
/// lanes can be assumed active.
bool maskIsAllOneOrUndef(Value *Mask);

/// Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y)
/// for each lane which may be active.
APInt possiblyDemandedEltsInMask(Value *Mask);

/// The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// Each member in this group has an index starting from 0, and the largest
/// index should be less than the interleave factor, which is equal to the
/// absolute value of the access's stride.
///
/// E.g. An interleaved load group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          a = A[i];                           // Member of index 0
///          b = A[i+1];                         // Member of index 1
///          d = A[i+3];                         // Member of index 3
///          ...
///        }
///
///      An interleaved store group of factor 4:
///        for (unsigned i = 0; i < 1024; i+=4) {
///          ...
///          A[i]   = a;                         // Member of index 0
///          A[i+1] = b;                         // Member of index 1
///          A[i+2] = c;                         // Member of index 2
///          A[i+3] = d;                         // Member of index 3
///        }
///
/// Note: the interleaved load group could have gaps (missing members), but
/// the interleaved store group doesn't allow gaps.
template <typename InstTy> class InterleaveGroup {
public:
  InterleaveGroup(uint32_t Factor, bool Reverse, Align Alignment)
      : Factor(Factor), Reverse(Reverse), Alignment(Alignment),
        InsertPos(nullptr) {}

  InterleaveGroup(InstTy *Instr, int32_t Stride, Align Alignment)
      : Alignment(Alignment), InsertPos(Instr) {
    Factor = std::abs(Stride);
    assert(Factor > 1 && "Invalid interleave factor");

    Reverse = Stride < 0;
    Members[0] = Instr;
  }

  bool isReverse() const { return Reverse; }
  uint32_t getFactor() const { return Factor; }
  Align getAlign() const { return Alignment; }
  uint32_t getNumMembers() const { return Members.size(); }

  /// Try to insert a new member \p Instr with index \p Index and
  /// alignment \p NewAlign. The index is related to the leader and it could be
  /// negative if it is the new leader.
  ///
  /// \returns false if the instruction doesn't belong to the group.
  bool insertMember(InstTy *Instr, int32_t Index, Align NewAlign) {
    // Make sure the key fits in an int32_t.
    std::optional<int32_t> MaybeKey = checkedAdd(Index, SmallestKey);
    if (!MaybeKey)
      return false;
    int32_t Key = *MaybeKey;

    // Skip if the key is used for either the tombstone or empty special values.
    if (DenseMapInfo<int32_t>::getTombstoneKey() == Key ||
        DenseMapInfo<int32_t>::getEmptyKey() == Key)
      return false;

    // Skip if there is already a member with the same index.
    if (Members.find(Key) != Members.end())
      return false;

    if (Key > LargestKey) {
      // The largest index is always less than the interleave factor.
      if (Index >= static_cast<int32_t>(Factor))
        return false;

      LargestKey = Key;
    } else if (Key < SmallestKey) {

      // Make sure the largest index fits in an int32_t.
      std::optional<int32_t> MaybeLargestIndex = checkedSub(LargestKey, Key);
      if (!MaybeLargestIndex)
        return false;

      // The largest index is always less than the interleave factor.
      if (*MaybeLargestIndex >= static_cast<int64_t>(Factor))
        return false;

      SmallestKey = Key;
    }

    // It's always safe to select the minimum alignment.
    Alignment = std::min(Alignment, NewAlign);
    Members[Key] = Instr;
    return true;
  }

  /// Get the member with the given index \p Index.
  ///
  /// \returns nullptr if it contains no such member.
  InstTy *getMember(uint32_t Index) const {
    int32_t Key = SmallestKey + Index;
    return Members.lookup(Key);
  }

  /// Get the index for the given member. Unlike the key in the member
  /// map, the index starts from 0.
  uint32_t getIndex(const InstTy *Instr) const {
    for (auto I : Members) {
      if (I.second == Instr)
        return I.first - SmallestKey;
    }

    llvm_unreachable("InterleaveGroup contains no such member");
  }

  InstTy *getInsertPos() const { return InsertPos; }
  void setInsertPos(InstTy *Inst) { InsertPos = Inst; }

  /// Add metadata (e.g. alias info) from the instructions in this group to \p
  /// NewInst.
  ///
  /// FIXME: this function currently does not add noalias metadata a'la
  /// addNewMedata.  To do that we need to compute the intersection of the
  /// noalias info from all members.
  void addMetadata(InstTy *NewInst) const;

  /// Returns true if this Group requires a scalar iteration to handle gaps.
  bool requiresScalarEpilogue() const {
    // If the last member of the Group exists, then a scalar epilog is not
    // needed for this group.
    if (getMember(getFactor() - 1))
      return false;

    // We have a group with gaps. It therefore can't be a reversed access,
    // because such groups get invalidated (TODO).
    assert(!isReverse() && "Group should have been invalidated");

    // This is a group of loads, with gaps, and without a last member.
    return true;
  }

private:
  uint32_t Factor; // Interleave Factor.
  bool Reverse;
  Align Alignment;
  DenseMap<int32_t, InstTy *> Members;
  int32_t SmallestKey = 0;
  int32_t LargestKey = 0;

  // To avoid breaking dependences, vectorized instructions of an interleave
  // group should be inserted at either the first load or the last store in
  // program order.
  //
  // E.g. %even = load i32             // Insert Position
  //      %add = add i32 %even         // Use of %even
  //      %odd = load i32
  //
  //      store i32 %even
  //      %odd = add i32               // Def of %odd
  //      store i32 %odd               // Insert Position
  InstTy *InsertPos;
};

/// Drive the analysis of interleaved memory accesses in the loop.
///
/// Use this class to analyze interleaved accesses only when we can vectorize
/// a loop. Otherwise the analysis is meaningless, as vectorization of
/// interleaved accesses is unsafe.
///
/// The analysis collects interleave groups and records the relationships
/// between each member and its group in a map.
class InterleavedAccessInfo {
public:
  InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                        DominatorTree *DT, LoopInfo *LI,
                        const LoopAccessInfo *LAI)
      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}

  ~InterleavedAccessInfo() { invalidateGroups(); }

  /// Analyze the interleaved accesses and collect them in interleave
  /// groups. Substitute symbolic strides using \p Strides.
  /// Consider also predicated loads/stores in the analysis if
  /// \p EnableMaskedInterleavedGroup is true.
  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);

  /// Invalidate groups, e.g., in case all blocks in the loop will be
  /// predicated, contrary to the original assumption. Although we currently
  /// prevent group formation for predicated accesses, we may be able to relax
  /// this limitation in the future once we handle more complicated blocks.
  /// Returns true if any groups were invalidated.
  bool invalidateGroups() {
    if (InterleaveGroups.empty()) {
      assert(
          !RequiresScalarEpilogue &&
          "RequiresScalarEpilog should not be set without interleave groups");
      return false;
    }

    InterleaveGroupMap.clear();
    for (auto *Ptr : InterleaveGroups)
      delete Ptr;
    InterleaveGroups.clear();
    RequiresScalarEpilogue = false;
    return true;
  }

  /// Check if \p Instr belongs to any interleave group.
  bool isInterleaved(Instruction *Instr) const {
    return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end();
  }

  /// Get the interleave group that \p Instr belongs to.
  ///
  /// \returns nullptr if it doesn't belong to any group.
  InterleaveGroup<Instruction> *
  getInterleaveGroup(const Instruction *Instr) const {
    return InterleaveGroupMap.lookup(Instr);
  }

  iterator_range<SmallPtrSetIterator<llvm::InterleaveGroup<Instruction> *>>
  getInterleaveGroups() {
    return make_range(InterleaveGroups.begin(), InterleaveGroups.end());
  }

  /// Returns true if an interleaved group that may access memory
  /// out-of-bounds requires a scalar epilogue iteration for correctness.
  bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }

  /// Invalidate groups that require a scalar epilogue (due to gaps). This can
  /// happen when optimizing for size forbids a scalar epilogue, and the gap
  /// cannot be filtered by masking the load/store.
  void invalidateGroupsRequiringScalarEpilogue();

  /// Returns true if we have any interleave groups.
  bool hasGroups() const { return !InterleaveGroups.empty(); }

private:
  /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
  /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
  /// The interleaved access analysis can also add new predicates (for example
  /// by versioning strides of pointers).
  PredicatedScalarEvolution &PSE;

  Loop *TheLoop;
  DominatorTree *DT;
  LoopInfo *LI;
  const LoopAccessInfo *LAI;

  /// True if the loop may contain non-reversed interleaved groups with
  /// out-of-bounds accesses. We ensure we don't speculatively access memory
  /// out-of-bounds by executing at least one scalar epilogue iteration.
  bool RequiresScalarEpilogue = false;

  /// Holds the relationships between the members and the interleave group.
  DenseMap<Instruction *, InterleaveGroup<Instruction> *> InterleaveGroupMap;

  SmallPtrSet<InterleaveGroup<Instruction> *, 4> InterleaveGroups;

  /// Holds dependences among the memory accesses in the loop. It maps a source
  /// access to a set of dependent sink accesses.
  DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;

  /// The descriptor for a strided memory access.
  struct StrideDescriptor {
    StrideDescriptor() = default;
    StrideDescriptor(int64_t Stride, const SCEV *Scev, uint64_t Size,
                     Align Alignment)
        : Stride(Stride), Scev(Scev), Size(Size), Alignment(Alignment) {}

    // The access's stride. It is negative for a reverse access.
    int64_t Stride = 0;

    // The scalar expression of this access.
    const SCEV *Scev = nullptr;

    // The size of the memory object.
    uint64_t Size = 0;

    // The alignment of this access.
    Align Alignment;
  };

  /// A type for holding instructions and their stride descriptors.
  using StrideEntry = std::pair<Instruction *, StrideDescriptor>;

  /// Create a new interleave group with the given instruction \p Instr,
  /// stride \p Stride and alignment \p Align.
  ///
  /// \returns the newly created interleave group.
  InterleaveGroup<Instruction> *
  createInterleaveGroup(Instruction *Instr, int Stride, Align Alignment) {
    assert(!InterleaveGroupMap.count(Instr) &&
           "Already in an interleaved access group");
    InterleaveGroupMap[Instr] =
        new InterleaveGroup<Instruction>(Instr, Stride, Alignment);
    InterleaveGroups.insert(InterleaveGroupMap[Instr]);
    return InterleaveGroupMap[Instr];
  }

  /// Release the group and remove all the relationships.
  void releaseGroup(InterleaveGroup<Instruction> *Group) {
    for (unsigned i = 0; i < Group->getFactor(); i++)
      if (Instruction *Member = Group->getMember(i))
        InterleaveGroupMap.erase(Member);

    InterleaveGroups.erase(Group);
    delete Group;
  }

  /// Collect all the accesses with a constant stride in program order.
  void collectConstStrideAccesses(
      MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
      const ValueToValueMap &Strides);

  /// Returns true if \p Stride is allowed in an interleaved group.
  static bool isStrided(int Stride);

  /// Returns true if \p BB is a predicated block.
  bool isPredicated(BasicBlock *BB) const {
    return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
  }

  /// Returns true if LoopAccessInfo can be used for dependence queries.
  bool areDependencesValid() const {
    return LAI && LAI->getDepChecker().getDependences();
  }

  /// Returns true if memory accesses \p A and \p B can be reordered, if
  /// necessary, when constructing interleaved groups.
  ///
  /// \p A must precede \p B in program order. We return false if reordering is
  /// not necessary or is prevented because \p A and \p B may be dependent.
  bool canReorderMemAccessesForInterleavedGroups(StrideEntry *A,
                                                 StrideEntry *B) const {
    // Code motion for interleaved accesses can potentially hoist strided loads
    // and sink strided stores. The code below checks the legality of the
    // following two conditions:
    //
    // 1. Potentially moving a strided load (B) before any store (A) that
    //    precedes B, or
    //
    // 2. Potentially moving a strided store (A) after any load or store (B)
    //    that A precedes.
    //
    // It's legal to reorder A and B if we know there isn't a dependence from A
    // to B. Note that this determination is conservative since some
    // dependences could potentially be reordered safely.

    // A is potentially the source of a dependence.
    auto *Src = A->first;
    auto SrcDes = A->second;

    // B is potentially the sink of a dependence.
    auto *Sink = B->first;
    auto SinkDes = B->second;

    // Code motion for interleaved accesses can't violate WAR dependences.
    // Thus, reordering is legal if the source isn't a write.
    if (!Src->mayWriteToMemory())
      return true;

    // At least one of the accesses must be strided.
    if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
      return true;

    // If dependence information is not available from LoopAccessInfo,
    // conservatively assume the instructions can't be reordered.
    if (!areDependencesValid())
      return false;

    // If we know there is a dependence from source to sink, assume the
    // instructions can't be reordered. Otherwise, reordering is legal.
    return Dependences.find(Src) == Dependences.end() ||
           !Dependences.lookup(Src).count(Sink);
  }

  /// Collect the dependences from LoopAccessInfo.
  ///
  /// We process the dependences once during the interleaved access analysis to
  /// enable constant-time dependence queries.
  void collectDependences() {
    if (!areDependencesValid())
      return;
    auto *Deps = LAI->getDepChecker().getDependences();
    for (auto Dep : *Deps)
      Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
  }
};

} // end namespace llvm

#endif