WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – /llvm-build/x86_64/include/llvm/MC/MCSchedule.h

Rev	Author	Line No.	Line
14	pmbaty	1	//===-- llvm/MC/MCSchedule.h - Scheduling ------------------------ C++ --===//
		2	//
		3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	// See https://llvm.org/LICENSE.txt for license information.
		5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	//
		7	//===----------------------------------------------------------------------===//
		8	//
		9	// This file defines the classes used to describe a subtarget's machine model
		10	// for scheduling and other instruction cost heuristics.
		11	//
		12	//===----------------------------------------------------------------------===//
		13
		14	#ifndef LLVM_MC_MCSCHEDULE_H
		15	#define LLVM_MC_MCSCHEDULE_H
		16
		17	#include "llvm/Config/llvm-config.h"
		18	#include "llvm/Support/DataTypes.h"
		19	#include <cassert>
		20
		21	namespace llvm {
		22
		23	template <typename T> class ArrayRef;
		24	struct InstrItinerary;
		25	class MCSubtargetInfo;
		26	class MCInstrInfo;
		27	class MCInst;
		28	class InstrItineraryData;
		29
		30	/// Define a kind of processor resource that will be modeled by the scheduler.
		31	struct MCProcResourceDesc {
		32	const char *Name;
		33	unsigned NumUnits; // Number of resource of this kind
		34	unsigned SuperIdx; // Index of the resources kind that contains this kind.
		35
		36	// Number of resources that may be buffered.
		37	//
		38	// Buffered resources (BufferSize != 0) may be consumed at some indeterminate
		39	// cycle after dispatch. This should be used for out-of-order cpus when
		40	// instructions that use this resource can be buffered in a reservaton
		41	// station.
		42	//
		43	// Unbuffered resources (BufferSize == 0) always consume their resource some
		44	// fixed number of cycles after dispatch. If a resource is unbuffered, then
		45	// the scheduler will avoid scheduling instructions with conflicting resources
		46	// in the same cycle. This is for in-order cpus, or the in-order portion of
		47	// an out-of-order cpus.
		48	int BufferSize;
		49
		50	// If the resource has sub-units, a pointer to the first element of an array
		51	// of `NumUnits` elements containing the ProcResourceIdx of the sub units.
		52	// nullptr if the resource does not have sub-units.
		53	const unsigned *SubUnitsIdxBegin;
		54
		55	bool operator==(const MCProcResourceDesc &Other) const {
		56	return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx
		57	&& BufferSize == Other.BufferSize;
		58	}
		59	};
		60
		61	/// Identify one of the processor resource kinds consumed by a particular
		62	/// scheduling class for the specified number of cycles.
		63	struct MCWriteProcResEntry {
		64	uint16_t ProcResourceIdx;
		65	uint16_t Cycles;
		66
		67	bool operator==(const MCWriteProcResEntry &Other) const {
		68	return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
		69	}
		70	};
		71
		72	/// Specify the latency in cpu cycles for a particular scheduling class and def
		73	/// index. -1 indicates an invalid latency. Heuristics would typically consider
		74	/// an instruction with invalid latency to have infinite latency. Also identify
		75	/// the WriteResources of this def. When the operand expands to a sequence of
		76	/// writes, this ID is the last write in the sequence.
		77	struct MCWriteLatencyEntry {
		78	int16_t Cycles;
		79	uint16_t WriteResourceID;
		80
		81	bool operator==(const MCWriteLatencyEntry &Other) const {
		82	return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
		83	}
		84	};
		85
		86	/// Specify the number of cycles allowed after instruction issue before a
		87	/// particular use operand reads its registers. This effectively reduces the
		88	/// write's latency. Here we allow negative cycles for corner cases where
		89	/// latency increases. This rule only applies when the entry's WriteResource
		90	/// matches the write's WriteResource.
		91	///
		92	/// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by
		93	/// WriteResourceIdx.
		94	struct MCReadAdvanceEntry {
		95	unsigned UseIdx;
		96	unsigned WriteResourceID;
		97	int Cycles;
		98
		99	bool operator==(const MCReadAdvanceEntry &Other) const {
		100	return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID
		101	&& Cycles == Other.Cycles;
		102	}
		103	};
		104
		105	/// Summarize the scheduling resources required for an instruction of a
		106	/// particular scheduling class.
		107	///
		108	/// Defined as an aggregate struct for creating tables with initializer lists.
		109	struct MCSchedClassDesc {
		110	static const unsigned short InvalidNumMicroOps = (1U << 13) - 1;
		111	static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1;
		112
		113	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
		114	const char* Name;
		115	#endif
		116	uint16_t NumMicroOps : 13;
		117	uint16_t BeginGroup : 1;
		118	uint16_t EndGroup : 1;
		119	uint16_t RetireOOO : 1;
		120	uint16_t WriteProcResIdx; // First index into WriteProcResTable.
		121	uint16_t NumWriteProcResEntries;
		122	uint16_t WriteLatencyIdx; // First index into WriteLatencyTable.
		123	uint16_t NumWriteLatencyEntries;
		124	uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable.
		125	uint16_t NumReadAdvanceEntries;
		126
		127	bool isValid() const {
		128	return NumMicroOps != InvalidNumMicroOps;
		129	}
		130	bool isVariant() const {
		131	return NumMicroOps == VariantNumMicroOps;
		132	}
		133	};
		134
		135	/// Specify the cost of a register definition in terms of number of physical
		136	/// register allocated at register renaming stage. For example, AMD Jaguar.
		137	/// natively supports 128-bit data types, and operations on 256-bit registers
		138	/// (i.e. YMM registers) are internally split into two COPs (complex operations)
		139	/// and each COP updates a physical register. Basically, on Jaguar, a YMM
		140	/// register write effectively consumes two physical registers. That means,
		141	/// the cost of a YMM write in the BtVer2 model is 2.
		142	struct MCRegisterCostEntry {
		143	unsigned RegisterClassID;
		144	unsigned Cost;
		145	bool AllowMoveElimination;
		146	};
		147
		148	/// A register file descriptor.
		149	///
		150	/// This struct allows to describe processor register files. In particular, it
		151	/// helps describing the size of the register file, as well as the cost of
		152	/// allocating a register file at register renaming stage.
		153	/// FIXME: this struct can be extended to provide information about the number
		154	/// of read/write ports to the register file. A value of zero for field
		155	/// 'NumPhysRegs' means: this register file has an unbounded number of physical
		156	/// registers.
		157	struct MCRegisterFileDesc {
		158	const char *Name;
		159	uint16_t NumPhysRegs;
		160	uint16_t NumRegisterCostEntries;
		161	// Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
		162	uint16_t RegisterCostEntryIdx;
		163	// A value of zero means: there is no limit in the number of moves that can be
		164	// eliminated every cycle.
		165	uint16_t MaxMovesEliminatedPerCycle;
		166	// Ture if this register file only knows how to optimize register moves from
		167	// known zero registers.
		168	bool AllowZeroMoveEliminationOnly;
		169	};
		170
		171	/// Provide extra details about the machine processor.
		172	///
		173	/// This is a collection of "optional" processor information that is not
		174	/// normally used by the LLVM machine schedulers, but that can be consumed by
		175	/// external tools like llvm-mca to improve the quality of the peformance
		176	/// analysis.
		177	struct MCExtraProcessorInfo {
		178	// Actual size of the reorder buffer in hardware.
		179	unsigned ReorderBufferSize;
		180	// Number of instructions retired per cycle.
		181	unsigned MaxRetirePerCycle;
		182	const MCRegisterFileDesc *RegisterFiles;
		183	unsigned NumRegisterFiles;
		184	const MCRegisterCostEntry *RegisterCostTable;
		185	unsigned NumRegisterCostEntries;
		186	unsigned LoadQueueID;
		187	unsigned StoreQueueID;
		188	};
		189
		190	/// Machine model for scheduling, bundling, and heuristics.
		191	///
		192	/// The machine model directly provides basic information about the
		193	/// microarchitecture to the scheduler in the form of properties. It also
		194	/// optionally refers to scheduler resource tables and itinerary
		195	/// tables. Scheduler resource tables model the latency and cost for each
		196	/// instruction type. Itinerary tables are an independent mechanism that
		197	/// provides a detailed reservation table describing each cycle of instruction
		198	/// execution. Subtargets may define any or all of the above categories of data
		199	/// depending on the type of CPU and selected scheduler.
		200	///
		201	/// The machine independent properties defined here are used by the scheduler as
		202	/// an abstract machine model. A real micro-architecture has a number of
		203	/// buffers, queues, and stages. Declaring that a given machine-independent
		204	/// abstract property corresponds to a specific physical property across all
		205	/// subtargets can't be done. Nonetheless, the abstract model is
		206	/// useful. Futhermore, subtargets typically extend this model with processor
		207	/// specific resources to model any hardware features that can be exploited by
		208	/// scheduling heuristics and aren't sufficiently represented in the abstract.
		209	///
		210	/// The abstract pipeline is built around the notion of an "issue point". This
		211	/// is merely a reference point for counting machine cycles. The physical
		212	/// machine will have pipeline stages that delay execution. The scheduler does
		213	/// not model those delays because they are irrelevant as long as they are
		214	/// consistent. Inaccuracies arise when instructions have different execution
		215	/// delays relative to each other, in addition to their intrinsic latency. Those
		216	/// special cases can be handled by TableGen constructs such as, ReadAdvance,
		217	/// which reduces latency when reading data, and ResourceCycles, which consumes
		218	/// a processor resource when writing data for a number of abstract
		219	/// cycles.
		220	///
		221	/// TODO: One tool currently missing is the ability to add a delay to
		222	/// ResourceCycles. That would be easy to add and would likely cover all cases
		223	/// currently handled by the legacy itinerary tables.
		224	///
		225	/// A note on out-of-order execution and, more generally, instruction
		226	/// buffers. Part of the CPU pipeline is always in-order. The issue point, which
		227	/// is the point of reference for counting cycles, only makes sense as an
		228	/// in-order part of the pipeline. Other parts of the pipeline are sometimes
		229	/// falling behind and sometimes catching up. It's only interesting to model
		230	/// those other, decoupled parts of the pipeline if they may be predictably
		231	/// resource constrained in a way that the scheduler can exploit.
		232	///
		233	/// The LLVM machine model distinguishes between in-order constraints and
		234	/// out-of-order constraints so that the target's scheduling strategy can apply
		235	/// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order
		236	/// resources would not typically be treated as a hard scheduling
		237	/// constraint. For example, in the GenericScheduler, a delay caused by limited
		238	/// out-of-order resources is not directly reflected in the number of cycles
		239	/// that the scheduler sees between issuing an instruction and its dependent
		240	/// instructions. In other words, out-of-order resources don't directly increase
		241	/// the latency between pairs of instructions. However, they can still be used
		242	/// to detect potential bottlenecks across a sequence of instructions and bias
		243	/// the scheduling heuristics appropriately.
		244	struct MCSchedModel {
		245	// IssueWidth is the maximum number of instructions that may be scheduled in
		246	// the same per-cycle group. This is meant to be a hard in-order constraint
		247	// (a.k.a. "hazard"). In the GenericScheduler strategy, no more than
		248	// IssueWidth micro-ops can ever be scheduled in a particular cycle.
		249	//
		250	// In practice, IssueWidth is useful to model any bottleneck between the
		251	// decoder (after micro-op expansion) and the out-of-order reservation
		252	// stations or the decoder bandwidth itself. If the total number of
		253	// reservation stations is also a bottleneck, or if any other pipeline stage
		254	// has a bandwidth limitation, then that can be naturally modeled by adding an
		255	// out-of-order processor resource.
		256	unsigned IssueWidth;
		257	static const unsigned DefaultIssueWidth = 1;
		258
		259	// MicroOpBufferSize is the number of micro-ops that the processor may buffer
		260	// for out-of-order execution.
		261	//
		262	// "0" means operations that are not ready in this cycle are not considered
		263	// for scheduling (they go in the pending queue). Latency is paramount. This
		264	// may be more efficient if many instructions are pending in a schedule.
		265	//
		266	// "1" means all instructions are considered for scheduling regardless of
		267	// whether they are ready in this cycle. Latency still causes issue stalls,
		268	// but we balance those stalls against other heuristics.
		269	//
		270	// "> 1" means the processor is out-of-order. This is a machine independent
		271	// estimate of highly machine specific characteristics such as the register
		272	// renaming pool and reorder buffer.
		273	unsigned MicroOpBufferSize;
		274	static const unsigned DefaultMicroOpBufferSize = 0;
		275
		276	// LoopMicroOpBufferSize is the number of micro-ops that the processor may
		277	// buffer for optimized loop execution. More generally, this represents the
		278	// optimal number of micro-ops in a loop body. A loop may be partially
		279	// unrolled to bring the count of micro-ops in the loop body closer to this
		280	// number.
		281	unsigned LoopMicroOpBufferSize;
		282	static const unsigned DefaultLoopMicroOpBufferSize = 0;
		283
		284	// LoadLatency is the expected latency of load instructions.
		285	unsigned LoadLatency;
		286	static const unsigned DefaultLoadLatency = 4;
		287
		288	// HighLatency is the expected latency of "very high latency" operations.
		289	// See TargetInstrInfo::isHighLatencyDef().
		290	// By default, this is set to an arbitrarily high number of cycles
		291	// likely to have some impact on scheduling heuristics.
		292	unsigned HighLatency;
		293	static const unsigned DefaultHighLatency = 10;
		294
		295	// MispredictPenalty is the typical number of extra cycles the processor
		296	// takes to recover from a branch misprediction.
		297	unsigned MispredictPenalty;
		298	static const unsigned DefaultMispredictPenalty = 10;
		299
		300	bool PostRAScheduler; // default value is false
		301
		302	bool CompleteModel;
		303
		304	unsigned ProcID;
		305	const MCProcResourceDesc *ProcResourceTable;
		306	const MCSchedClassDesc *SchedClassTable;
		307	unsigned NumProcResourceKinds;
		308	unsigned NumSchedClasses;
		309	// Instruction itinerary tables used by InstrItineraryData.
		310	friend class InstrItineraryData;
		311	const InstrItinerary *InstrItineraries;
		312
		313	const MCExtraProcessorInfo *ExtraProcessorInfo;
		314
		315	bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; }
		316
		317	unsigned getProcessorID() const { return ProcID; }
		318
		319	/// Does this machine model include instruction-level scheduling.
		320	bool hasInstrSchedModel() const { return SchedClassTable; }
		321
		322	const MCExtraProcessorInfo &getExtraProcessorInfo() const {
		323	assert(hasExtraProcessorInfo() &&
		324	"No extra information available for this model");
		325	return *ExtraProcessorInfo;
		326	}
		327
		328	/// Return true if this machine model data for all instructions with a
		329	/// scheduling class (itinerary class or SchedRW list).
		330	bool isComplete() const { return CompleteModel; }
		331
		332	/// Return true if machine supports out of order execution.
		333	bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
		334
		335	unsigned getNumProcResourceKinds() const {
		336	return NumProcResourceKinds;
		337	}
		338
		339	const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
		340	assert(hasInstrSchedModel() && "No scheduling machine model");
		341
		342	assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx");
		343	return &ProcResourceTable[ProcResourceIdx];
		344	}
		345
		346	const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const {
		347	assert(hasInstrSchedModel() && "No scheduling machine model");
		348
		349	assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
		350	return &SchedClassTable[SchedClassIdx];
		351	}
		352
		353	/// Returns the latency value for the scheduling class.
		354	static int computeInstrLatency(const MCSubtargetInfo &STI,
		355	const MCSchedClassDesc &SCDesc);
		356
		357	int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const;
		358	int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
		359	const MCInst &Inst) const;
		360
		361	// Returns the reciprocal throughput information from a MCSchedClassDesc.
		362	static double
		363	getReciprocalThroughput(const MCSubtargetInfo &STI,
		364	const MCSchedClassDesc &SCDesc);
		365
		366	static double
		367	getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID);
		368
		369	double
		370	getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
		371	const MCInst &Inst) const;
		372
		373	/// Returns the maximum forwarding delay for register reads dependent on
		374	/// writes of scheduling class WriteResourceIdx.
		375	static unsigned getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
		376	unsigned WriteResourceIdx = 0);
		377
		378	/// Returns the default initialized model.
		379	static const MCSchedModel &GetDefaultSchedModel() { return Default; }
		380	static const MCSchedModel Default;
		381	};
		382
		383	} // namespace llvm
		384
		385	#endif

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite/llvm-build/x86_64/include/llvm/MC/MCSchedule.h – Rev 14