#include "debug/GPUAgentDisp.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
// Shader constructor (excerpt): the member initializer list pulls most of the
// configuration from the Params object, then the body registers this shader
// with the command processor and the dispatcher.
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    n_cu_per_sqc(p.cu_per_sqc),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher), systemHub(p.system_hub),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    progressInterval(p.progress_interval),
    stats(this, p.CUs[0]->wfSize())
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);
    // Aperture setup: each aperture's limit keeps the high-order bits of its
    // base and saturates the low-order offset bits.
    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = 0x1000000000000;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = 0x2000000000000;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    shHiddenPrivateBaseVmid = 0;
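// The masking above makes the aperture sizes explicit: 40 offset bits for the
// GPUVM aperture (1 TiB) and 32 offset bits for the LDS and scratch apertures
// (4 GiB each). A minimal standalone sketch of that arithmetic follows; the
// variable names are illustrative and not taken from the source.
#include <cassert>
#include <cstdint>

int main()
{
    uint64_t gpuvm_base = (uint64_t{1} << 61) + 0x1000000000000ULL;
    uint64_t gpuvm_limit =
        (gpuvm_base & 0xFFFFFF0000000000ULL) | 0xFFFFFFFFFFULL;
    // 40 low-order offset bits -> a 1 TiB aperture.
    assert(gpuvm_limit - (gpuvm_base & 0xFFFFFF0000000000ULL) + 1 ==
           (uint64_t{1} << 40));

    uint64_t lds_base = 0x1000000000000ULL;
    uint64_t lds_limit = (lds_base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFFULL;
    // 32 low-order offset bits -> a 4 GiB aperture.
    assert(lds_limit - (lds_base & 0xFFFFFFFF00000000ULL) + 1 ==
           (uint64_t{1} << 32));
    return 0;
}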
    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p.CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p.idlecu_timeout;
    }
    // Shader::mmap (excerpt): carve 'length' bytes out of the process' mmap
    // region, in whichever direction that region grows.
    if (proc->mmapGrowsDown()) {
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // Make sure the region does not run into the stack, which grows down
        // from its base.
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
    for (int j = 0; j < n_cu; ++j)
        // ...

    // Scheduled-add handling (excerpt): apply any pending scheduled adds, then
    // either schedule the next shader wakeup or let the shader go to sleep.
    for (int i = 0; i < sa_n; ++i) {
        // ...
    }

    Tick shader_wakeup = *std::max_element(sa_when.begin(), sa_when.end());
    DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
    // ...
    DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    // Invalidate preparation (excerpt): for each CU, build the requests used
    // to invalidate the TCC and the SQC, then reset the CU's register pool.
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // ...
        auto tcc_req = std::make_shared<Request>(0, 0, 0,
                                                 cuList[i_cu]->requestorId(),
                                                 /* ... */);
        // ...
        auto sqc_req = std::make_shared<Request>(0, 0, 0,
                                                 cuList[i_cu]->requestorId(),
                                                 /* ... */);
        // ...
        cuList[i_cu]->resetRegisterPool();
    }

    // Flush preparation (excerpt): the kernel must have no outstanding
    // writebacks when the flush is handed to the CU.
    int kernId = gpuDynInst->kern_id;
    // ...
    assert(_dispatcher.getOutstandingWbs(kernId) == 0);
    // ...
    cuList[i_cu]->doFlush(gpuDynInst);
    // Workgroup dispatch (excerpt): probe the CUs for dispatch resources and
    // hand the workgroup to the first CU that can take it.
    bool scheduledSomething = false;
    // ...
    while (cuCount < n_cu) {
        // ...
        int num_wfs_in_wg = 0;
        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
        // ...
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    /* ... */);
            DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    /* ... */);
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    /* ... */);
            // ...
            panic_if(/* ... */, "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
    // ...
    DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);

    return scheduledSomething;
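// The loop above probes one CU per iteration (cuCount bounds the number of
// probes), and the starting point rotates via nextSchedCu, which is
// initialized in the constructor above, so successive workgroups tend to be
// spread round-robin across compute units. A standalone sketch of such a
// rotation follows; the names and the has_resources table are illustrative,
// not taken from the source.
#include <cstdio>
#include <vector>

int main()
{
    const int n_cu = 4;
    int next_sched_cu = 2;                       // persists between dispatches
    std::vector<bool> has_resources = {true, true, false, true};

    // Probe every CU at most once, starting at next_sched_cu.
    for (int cu_count = 0; cu_count < n_cu; ++cu_count) {
        int cur_cu = (next_sched_cu + cu_count) % n_cu;
        if (has_resources[cur_cu]) {
            std::printf("dispatching a workgroup to CU %d\n", cur_cu);
            next_sched_cu = (cur_cu + 1) % n_cu;  // rotate the starting point
            break;
        }
    }
    return 0;
}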
// Functional access (excerpt): an access that straddles a cache-line boundary
// is split on that boundary and sent as two functional packets.
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();
    // ...
        fatal("unexpected MemCmd\n");

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    if (split_addr > tmp_addr) {
        // ...
        req->splitOnVaddr(split_addr, req1, req2);
        // ...
        if (suppress_func_errors) {
            // ...
        }
        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);
    } else {
        // ...
        if (suppress_func_errors) {
            // ...
        }
        cuList[0]->memPort[0].sendFunctional(new_pkt);
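// The split check above rounds the address of the access' last byte down to a
// cache-line boundary; if that boundary lies above the start address, the
// access straddles two lines and is issued as two packets. A standalone sketch
// of the address math follows; round_down mirrors the roundDown helper used
// above, and the concrete addresses are illustrative.
#include <cassert>
#include <cstdint>

static constexpr uint64_t round_down(uint64_t val, uint64_t align)
{
    // Same semantics as the simulator's roundDown helper.
    return val - (val % align);
}

int main()
{
    const uint64_t block_size = 64;

    // 8 bytes at 0x1003c: the last byte (0x10043) rounds down to 0x10040,
    // which is above the start address, so the access spans two lines.
    uint64_t addr = 0x1003c, size = 8;
    uint64_t split_addr = round_down(addr + size - 1, block_size);
    assert(split_addr > addr);

    // 8 bytes at 0x10040 stay within one line: split_addr equals the start.
    addr = 0x10040;
    split_addr = round_down(addr + size - 1, block_size);
    assert(split_addr == addr);
    return 0;
}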
    // ScheduleAdd (excerpt): a new scheduled add either schedules the shader
    // wakeup event or reuses one that is already pending.
    DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
            /* ... */);
    // ...
    DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
            /* ... */);
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    // Walk the range in cache-line-sized chunks; each chunk becomes its own
    // functional request.
    for (ChunkGenerator gen(/* ... */); !gen.done(); gen.next()) {
        // ...
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);
        // ...
        data_buf += gen.size();
    }
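// The chunking above keeps every functional request within a single cache
// line; the first and last chunks may be shorter than a full line. A
// standalone sketch of that chunking follows; the names and sizes are
// illustrative, not taken from the source.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t line = 64;      // assumed cache-line size
    uint64_t addr = 0x1003c;       // start of the buffer
    uint64_t remaining = 100;      // bytes left to access

    while (remaining > 0) {
        // The distance to the next line boundary bounds the chunk size.
        uint64_t to_boundary = line - (addr % line);
        uint64_t chunk = std::min(remaining, to_boundary);
        std::printf("chunk at 0x%llx, %llu bytes\n",
                    (unsigned long long)addr, (unsigned long long)chunk);
        addr += chunk;
        remaining -= chunk;
    }
    return 0;
}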
    // Shader::ReadMem and Shader::WriteMem (excerpts): both take a trailing
    // suppress_func_errors flag and forward it to AccessMem.
                    bool suppress_func_errors)
        // ...
                  suppress_func_errors);

                    bool suppress_func_errors)
        // ...
                  suppress_func_errors);
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
    // Store access-time sample (excerpt).
    stats.storeLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);

    // Load access-time sample (excerpt).
    stats.loadLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);
    // Per-instruction round-trip sample (excerpt): the five timestamps bracket
    // the instrumented stages of a vector memory instruction, and adjacent
    // differences are sampled into the latency distributions.
    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];

    stats.initToCoalesceLatency.sample(t2 - t1);
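// The stat descriptions registered later in this listing name the stages those
// timestamps delimit: initiateAcc, coalescer issue, coalescer hit callback, GM
// pipe enqueue, and departure from the GM pipe's ordered response buffer. A
// standalone sketch of turning such a timestamp vector into per-stage deltas
// follows; the concrete tick values are illustrative.
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    using Tick = uint64_t;
    // Hypothetical timestamps for the five instrumented points, in ticks.
    std::vector<Tick> round_trip_time = {1000, 1400, 5200, 5300, 6100};
    assert(round_trip_time.size() == 5);

    // Adjacent differences are the per-stage latencies that get sampled,
    // e.g. round_trip_time[1] - round_trip_time[0] feeds the
    // init-to-coalesce distribution.
    std::vector<Tick> stage_latency;
    for (size_t i = 1; i < round_trip_time.size(); ++i)
        stage_latency.push_back(round_trip_time[i] - round_trip_time[i - 1]);

    assert(stage_latency.size() == 4);
    return 0;
}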
    // Per-line round-trip sample (excerpt): count the coalesced lines, compute
    // per-line network times, then sample them in sorted order.
    stats.coalsrLineAddresses.sample(lineMap.size());
    // ...
    for (auto& it : lineMap) {
        // ...
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        // ...
    }

    std::sort(netTimes.begin(), netTimes.end());
    // ...
    for (auto& time : netTimes) {
        stats.cacheBlockRoundTrip[idx].sample(time);
544 "Invalid activeCu size\n");
567 gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch),
568 std::get<1>(dispatch),
569 std::get<2>(dispatch));
580 std::make_tuple(raw_pkt, queue_id, host_pkt_addr));
604 "Ticks from vmem inst initiateAcc to coalescer issue"),
606 "Ticks from coalescer issue to coalescer hit callback"),
608 "Ticks from coalescer hit callback to GM pipe enqueue"),
610 "Ticks queued in GM pipes ordered response buffer"),
612 "Number of cache lines for coalesced request"),
614 "Total ticks that any CU attached to this shader is active"),
616 "vector instruction source operand distribution"),
618 "vector instruction destination operand distribution")
621 .init(0, 1600000-1, 10000)
625 .init(0, 1600000-1, 10000)
629 .init(0, 1600000-1, 10000)
633 .init(0, 1600000-1, 10000)
637 .init(0, 1600000-1, 10000)
641 .init(0, 1600000-1, 10000)
645 .init(0, 1600000-1, 10000)
656 for (
int idx = 0; idx < wf_size; ++idx) {
657 std::stringstream namestr;
658 ccprintf(namestr,
"%s.cacheBlockRoundTrip%d",
661 .init(0, 1600000-1, 10000)
663 .desc(
"Coalsr-to-coalsr time for the Nth cache block in an inst")