#include "debug/GPUAgentDisp.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p.cpu_pointer),
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    n_cu_per_sqc(p.cu_per_sqc),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher), systemHub(p.system_hub),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    stats(this, p.CUs[0]->wfSize())
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);
    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = 0x1000000000000;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = 0x2000000000000;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
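// A minimal standalone sketch of the aperture arithmetic above, using plain
// uint64_t in place of gem5's Addr/ApertureRegister types: each limit keeps
// the high bits of its base and fills the low bits with ones, giving an
// inclusive end address for the GPU VM, LDS, and scratch windows.
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main()
{
    uint64_t gpu_vm_base  = (uint64_t{1} << 61) + 0x1000000000000ULL;
    uint64_t gpu_vm_limit = (gpu_vm_base & 0xFFFFFF0000000000ULL) | 0xFFFFFFFFFFULL;

    uint64_t lds_base  = 0x1000000000000ULL;
    uint64_t lds_limit = (lds_base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFFULL;

    uint64_t scratch_base  = 0x2000000000000ULL;
    uint64_t scratch_limit = (scratch_base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFFULL;

    std::printf("GPU VM:  [%#" PRIx64 ", %#" PRIx64 "]\n", gpu_vm_base, gpu_vm_limit);
    std::printf("LDS:     [%#" PRIx64 ", %#" PRIx64 "]\n", lds_base, lds_limit);
    std::printf("Scratch: [%#" PRIx64 ", %#" PRIx64 "]\n", scratch_base, scratch_limit);
    return 0;
}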
    shHiddenPrivateBaseVmid = 0;
    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p.CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p.idlecu_timeout;
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);

        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
    for (int j = 0; j < n_cu; ++j)
    for (int i = 0; i < sa_n; ++i) {
        Tick shader_wakeup = *std::max_element(sa_when.begin(),
                                               sa_when.end());
        DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);

        DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {

        auto req = std::make_shared<Request>(0, 0, 0,
                                             cuList[i_cu]->requestorId(),
        cuList[i_cu]->resetRegisterPool();

    int kernId = gpuDynInst->kern_id;

        cuList[i_cu]->doFlush(gpuDynInst);
    bool scheduledSomething = false;

    while (cuCount < n_cu) {
            int num_wfs_in_wg = 0;
            bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
                scheduledSomething = true;
                DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                         "Invalid activeCu size\n");
                cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);

    DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);

    return scheduledSomething;
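// Simplified sketch (hypothetical MiniTask/MiniCU types) of the round-robin
// policy behind dispatchWorkgroups(): starting from the CU after the last one
// scheduled, visit each CU once per call and dispatch a workgroup wherever
// resources allow, reporting whether anything was scheduled on this pass.
#include <vector>

struct MiniTask { int wgsLeft; bool dispComplete() const { return wgsLeft == 0; } };
struct MiniCU   { int freeWgSlots; bool hasRoom() const { return freeWgSlots > 0; } };

bool
dispatchRoundRobin(std::vector<MiniCU> &cus, MiniTask &task, int &nextSchedCu)
{
    bool scheduledSomething = false;
    const int n_cu = static_cast<int>(cus.size());
    for (int cuCount = 0; cuCount < n_cu; ++cuCount) {
        int curCu = nextSchedCu;
        if (!task.dispComplete() && cus[curCu].hasRoom()) {
            --cus[curCu].freeWgSlots;
            --task.wgsLeft;
            scheduledSomething = true;
        }
        nextSchedCu = (curCu + 1) % n_cu;
    }
    return scheduledSomething;
}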
                          bool suppress_func_errors, int cu_id)
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();
        fatal("unexpected MemCmd\n");
    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
    if (split_addr > tmp_addr) {
        req->splitOnVaddr(split_addr, req1, req2);

        if (suppress_func_errors) {

        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);

        if (suppress_func_errors) {

        cuList[0]->memPort[0].sendFunctional(new_pkt);
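// Sketch of the split decision used in doFunctionalAccess() above: a request
// of `size` bytes at `vaddr` touches at most two cache lines of `block_size`
// bytes, and roundDown() (power-of-two alignment assumed, as in
// base/intmath.hh) locates where the second half would begin.
#include <cassert>
#include <cstdint>
#include <utility>

template <typename T, typename U>
static constexpr T
roundDown(const T &val, const U &align)
{
    return val & ~(static_cast<T>(align) - 1);
}

// Returns {split_addr, needs_split}.
std::pair<uint64_t, bool>
splitPoint(uint64_t vaddr, unsigned size, unsigned block_size)
{
    uint64_t split_addr = roundDown(vaddr + size - 1, block_size);
    // A single request may cross at most one line boundary.
    assert(split_addr <= vaddr || split_addr - vaddr < block_size);
    return {split_addr, split_addr > vaddr};
}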
        DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "

        DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
                  MemCmd cmd, bool suppress_func_errors)
    uint8_t *data_buf = (uint8_t*)ptr;

         !gen.done(); gen.next()) {
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);

        data_buf += gen.size();
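// Standalone sketch of the chunking pattern the ChunkGenerator loop above
// relies on: walk an (address, size) region in pieces that never cross a
// chunk_size boundary, advancing through the buffer one piece at a time.
#include <algorithm>
#include <cstdint>
#include <cstdio>

void
forEachChunk(uint64_t addr, uint64_t size, uint64_t chunk_size)
{
    uint64_t cur = addr;
    const uint64_t end = addr + size;
    while (cur < end) {
        // Bytes remaining until the next chunk boundary, capped by bytes left.
        uint64_t to_boundary = chunk_size - (cur % chunk_size);
        uint64_t len = std::min(to_boundary, end - cur);
        std::printf("chunk at %#llx, %llu bytes\n",
                    static_cast<unsigned long long>(cur),
                    static_cast<unsigned long long>(len));
        cur += len;
    }
}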
              bool suppress_func_errors)
              suppress_func_errors);

              bool suppress_func_errors)
              suppress_func_errors);
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];
    for (auto& it : lineMap) {
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);

    std::sort(netTimes.begin(), netTimes.end());

    for (auto& time : netTimes) {
539 "Invalid activeCu size\n");
                                 std::get<1>(dispatch),
                                 std::get<2>(dispatch));

        std::make_tuple(raw_pkt, queue_id, host_pkt_addr));
    : statistics::Group(parent),
      ADD_STAT(allLatencyDist, "delay distribution for all"),
      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
593 "Ticks from vmem inst initiateAcc to coalescer issue"),
595 "Ticks from coalescer issue to coalescer hit callback"),
597 "Ticks from coalescer hit callback to GM pipe enqueue"),
599 "Ticks queued in GM pipes ordered response buffer"),
601 "Number of cache lines for coalesced request"),
603 "Total ticks that any CU attached to this shader is active"),
605 "vector instruction source operand distribution"),
607 "vector instruction destination operand distribution")
        .init(0, 1600000-1, 10000)
        .init(0, 1600000-1, 10000)
        .init(0, 1600000-1, 10000)
        .init(0, 1600000-1, 10000)
        .init(0, 1600000-1, 10000)
        .init(0, 1600000-1, 10000)
        .init(0, 1600000-1, 10000)
    for (int idx = 0; idx < wf_size; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d",

            .init(0, 1600000-1, 10000)
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
Declaration and inline definition of ChunkGenerator object.
virtual ThreadContext * getContext(int tn)
Given a thread num get the thread context for it.
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from GPU device.
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
int getOutstandingWbs(int kern_id)
get kernel's outstanding cache writeback requests
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
bool dispComplete() const
virtual std::string name() const
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
SenderState * senderState
This packet's sender state.
RequestPtr req
A pointer to the original request.
void setSuppressFuncError()
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward,...
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
std::shared_ptr< MemState > memState
void prepareInvalidate(HSAQueueEntry *task)
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
EventFunctionWrapper tickEvent
std::vector< ComputeUnit * > cuList
void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
void ScheduleAdd(int *val, Tick when, int x)
GPUDispatcher & _dispatcher
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick > > &roundTripTime)
std::vector< uint64_t > sa_when
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
std::vector< int32_t > sa_x
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
gem5::Shader::ShaderStats stats
std::vector< std::tuple< void *, uint32_t, Addr > > deferred_dispatches
bool dispatchWorkgroups(HSAQueueEntry *task)
GPUDispatcher & dispatcher()
void decNumOutstandingInvL2s()
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
void updateContext(int cid)
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
void sampleLoad(const Tick accessTime)
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
int num_outstanding_invl2s
void sampleStore(const Tick accessTime)
GPUCommandProcessor & gpuCmdProc
std::vector< int * > sa_val
virtual Process * getProcessPtr()=0
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
void sample(const U &v, int n=1)
Add a value to the distribution n times.
A simple distribution stat.
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Derived & init(size_type size)
Set this vector to have the given size.
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
static constexpr T roundUp(const T &val, const U &align)
This function is used to align addresses in memory.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
bool scheduled() const
Determine if the current event is scheduled.
void schedule(Event &event, Tick when)
void reschedule(Event &event, Tick when, bool always=false)
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Tick when() const
Get the time that the event is scheduled.
#define fatal(...)
This implements a cprintf based fatal() function.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
HSAQueueEntry is the simulator's internal representation of an AQL queue entry (task).
const FlagsType pdf
Print the percent of the total that this entry represents.
const FlagsType oneline
Print all values on a single line.
T safe_cast(U &&ref_or_ptr)
std::shared_ptr< Request > RequestPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
uint64_t Tick
Tick count type.
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
void ccprintf(cp::Print &print)
Declaration of the Packet class.
GPU TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
statistics::Vector vectorInstSrcOperand
statistics::Distribution storeLatencyDist
statistics::Distribution initToCoalesceLatency
statistics::Scalar shaderActiveTicks
statistics::Distribution loadLatencyDist
statistics::Distribution allLatencyDist
statistics::Distribution gmToCompleteLatency
ShaderStats(statistics::Group *parent, int wf_size)
statistics::Distribution coalsrLineAddresses
statistics::Vector vectorInstDstOperand
statistics::Distribution rubyNetworkLatency
statistics::Distribution * cacheBlockRoundTrip
statistics::Distribution gmEnqueueLatency