#include "debug/GPUAgentDisp.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
Shader::Shader(const Params &p) : ClockedObject(p),
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p.cpu_pointer),
    tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
              false, Event::CPU_Tick_Pri),
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    coissue_return(1),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher), systemHub(p.system_hub),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    stats(this, p.CUs[0]->wfSize())
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);

    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = ((Addr)1 << 61) + 0x0;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
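
    // Illustrative aperture arithmetic (values follow from the masks
    // above): each aperture base sits at bit 61, and the limit keeps the
    // base's upper bits while saturating the low bits. For the LDS
    // aperture this yields a 4 GiB window:
    //
    //     base  = (Addr)1 << 61     == 0x2000000000000000
    //     limit = base | 0xFFFFFFFF == 0x20000000FFFFFFFF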

    shHiddenPrivateBaseVmid = 0;

    // ...
    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p.CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p.idlecu_timeout;
    }
}

// ...

Addr
Shader::mmap(int length)
{
    Addr start;
    // ...
    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // Make sure we don't grow up into the stack, which grows down.
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
    // ...
}

// ...

void
Shader::execScheduledAdds()
{
    // Apply any scheduled adds that have come due.
    for (int i = 0; i < sa_n; ++i) {
        // ...
    }

    // Reschedule the wakeup if adds remain pending; otherwise sleep.
    if (!sa_when.empty()) {
        Tick shader_wakeup = *std::max_element(sa_when.begin(),
                                               sa_when.end());
        DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
        schedule(tickEvent, shader_wakeup);
    } else {
        DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    }
}

// ...

void
Shader::prepareInvalidate(HSAQueueEntry *task)
{
    // ...
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // Create a request to hold the invalidate info; its fields are
        // filled in by the CU before use.
        auto req = std::make_shared<Request>(0, 0, 0,
                                             cuList[i_cu]->requestorId(),
                                             /* ... */);
        // ...
        cuList[i_cu]->resetRegisterPool();
    }
}

void
Shader::prepareFlush(GPUDynInstPtr gpuDynInst)
{
    int kernId = gpuDynInst->kern_id;
    // ...
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // ...
        cuList[i_cu]->doFlush(gpuDynInst);
    }
}

// ...

bool
Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
    bool scheduledSomething = false;
    // ...

    // Probe each CU in turn for free workgroup slots.
    while (cuCount < n_cu) {
        // ...
        int num_wfs_in_wg = 0;
        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
        if (!task->dispComplete() && can_disp) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    curTick(), task->globalWgId(), curCu);
            // ...
            panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
                     "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
            // ...
        }
        // ...
    }

    DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);

    return scheduledSomething;
}
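
// Usage sketch (hypothetical caller; the real call site is in the
// GPUDispatcher): the dispatcher can keep offering the task until all of
// its workgroups are placed or no CU has free slots left:
//
//     while (!task->dispComplete() && shader->dispatchWorkgroups(task))
//         ;  // false means a full pass over the CUs dispatched nothing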

// ...

void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();
    // ...
        fatal("unexpected MemCmd\n");
    // ...
    tmp_addr = req->getVaddr();
    // Split the access if it crosses a cache-line boundary.
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);
        // ...
        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }
        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);
        // ...
    } else {
        // ...
        if (suppress_func_errors)
            new_pkt->setSuppressFuncError();

        cuList[0]->memPort[0].sendFunctional(new_pkt);
        // ...
    }
}
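
// Illustrative split arithmetic (assumes block_size = 64): an 8-byte
// access at vaddr 0x1003c gives split_addr = roundDown(0x10043, 64) =
// 0x10040 > 0x1003c, so the access straddles a line boundary and is split
// at 0x10040 into two 4-byte requests; a line-contained access yields
// split_addr <= tmp_addr and takes the single-packet path.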

// ...

void
Shader::ScheduleAdd(int *val, Tick when, int x)
{
    // ...
    if (!tickEvent.scheduled() || (when < tickEvent.when())) {
        DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
                "%lu\n", when);
        reschedule(tickEvent, when, true);
    } else {
        // ...
        DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
                "%lu\n", when);
    }
}

// ...

void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    // Walk the region one cache line at a time.
    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {
        RequestPtr req = std::make_shared<Request>(
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
    }
}
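
// Illustrative chunking (assumes a 64-byte cache line): for address =
// 0x1000 and size = 200, ChunkGenerator yields (0x1000, 64), (0x1040, 64),
// (0x1080, 64), (0x10c0, 8), so no single functional access crosses a
// line boundary.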

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}
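
// Usage sketch (hypothetical buffer): copy 16 bytes of simulated memory
// through CU 0 into host memory, suppressing functional-access errors:
//
//     uint8_t buf[16];
//     shader->ReadMem(0x1000, buf, sizeof(buf), 0, true);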

// ...

void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
{
    // ...
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
    // ...
}

// ...

void
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
{
    // ...
    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];
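
    // The five timestamps bracket a vmem instruction's path through the
    // memory system; in the full source the sampled deltas are t2-t1
    // (initToCoalesceLatency), t3-t2 (rubyNetworkLatency), t4-t3
    // (gmEnqueueLatency), and t5-t4 (gmToCompleteLatency).
    // ...
}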

// ...

void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
{
    // ...
    std::vector<Tick> netTimes;

    // Compute a round-trip time for each cache line the instruction
    // touched (a line with both a start and an end timestamp).
    for (auto& it : lineMap) {
        const std::vector<Tick> &timeVec = it.second;
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        }
    }

    // Sort so that the Nth distribution always measures the Nth-fastest
    // cache block of an instruction.
    std::sort(netTimes.begin(), netTimes.end());
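
    // Illustrative: for lineMap = {A: {100, 400}, B: {120, 300}} the
    // per-line round trips are 300 and 180 ticks; netTimes sorts to
    // {180, 300}, so cacheBlockRoundTrip[0] samples the fastest line and
    // cacheBlockRoundTrip[1] the slowest.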
    int idx = 0;
    for (auto& time : netTimes) {
        stats.cacheBlockRoundTrip[idx].sample(time);
        ++idx;
    }
}

// ...

    panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
             "Invalid activeCu size\n");

// ...

Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
    : statistics::Group(parent),
      ADD_STAT(allLatencyDist, "delay distribution for all"),
      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
      ADD_STAT(initToCoalesceLatency,
               "Ticks from vmem inst initiateAcc to coalescer issue"),
      ADD_STAT(rubyNetworkLatency,
               "Ticks from coalescer issue to coalescer hit callback"),
      ADD_STAT(gmEnqueueLatency,
               "Ticks from coalescer hit callback to GM pipe enqueue"),
      ADD_STAT(gmToCompleteLatency,
               "Ticks queued in GM pipes ordered response buffer"),
      ADD_STAT(coalsrLineAddresses,
               "Number of cache lines for coalesced request"),
      ADD_STAT(shaderActiveTicks,
               "Total ticks that any CU attached to this shader is active"),
      ADD_STAT(vectorInstSrcOperand,
               "vector instruction source operand distribution"),
      ADD_STAT(vectorInstDstOperand,
               "vector instruction destination operand distribution")
{
    allLatencyDist
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    loadLatencyDist
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    storeLatencyDist
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    initToCoalesceLatency
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    rubyNetworkLatency
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    gmEnqueueLatency
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    gmToCompleteLatency
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    // ...

    // One distribution per possible cache block of an instruction, up to
    // the wavefront size.
    cacheBlockRoundTrip = new statistics::Distribution[wf_size];
    for (int idx = 0; idx < wf_size; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
                 static_cast<Shader*>(parent)->name(), idx);
        cacheBlockRoundTrip[idx]
            .init(0, 1600000, 10000)
            .name(namestr.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
            .flags(statistics::pdf | statistics::oneline);
    }
}

Cross-referenced symbols:

Tick curTick()
    The universal simulation clock.
#define fatal(...)
    This implements a cprintf based fatal() function.
void prepareFlush(GPUDynInstPtr gpuDynInst)
    The dispatcher/shader arranges flush requests to the CUs.
ShaderStats(statistics::Group *parent, int wf_size)
std::vector<uint64_t> sa_when
virtual void init()
    init() is called after all C++ SimObjects have been created and all
    ports are connected.
Tick when() const
    Get the time that the event is scheduled.
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
std::vector<int32_t> sa_x
statistics::Distribution
    A simple distribution stat.
void sampleLoad(const Tick accessTime)
statistics::Vector vectorInstDstOperand
GPUDispatcher &_dispatcher
statistics::Distribution gmEnqueueLatency
GPU TranslationState
    This is currently somewhat of a bastardization of the usage of
    SenderState, ...
std::vector<int *> sa_val
RequestPtr req
    A pointer to the original request.
RequestorID vramRequestorId()
    Forward the VRAM requestor ID needed for device memory from the CP.
void updateContext(int cid)
std::vector<ComputeUnit *> cuList
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
void schedule(Event &event, Tick when)
void sampleInstRoundTrip(std::vector<Tick> roundTripTime)
int getOutstandingWbs(int kern_id)
    Get a kernel's outstanding cache writeback requests.
statistics::Vector vectorInstSrcOperand
EventFunctionWrapper tickEvent
statistics::Distribution *cacheBlockRoundTrip
void sample(const U &v, int n=1)
    Add a value to the distribution n times.
void ccprintf(cp::Print &print)
statistics::Scalar shaderActiveTicks
std::shared_ptr<MemState> memState
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                        bool suppress_func_errors, int cu_id)
ChunkGenerator
    This class takes an arbitrary memory region (address/length pair) and
    generates a series of appropria...
bool isInvStarted()
    Whether invalidate has started or finished; -1 is the initial value,
    indicating inv has not started for...
const FlagsType pdf
    Print the percent of the total that this entry represents.
void dataStatic(T *p)
    Set the data pointer to the following value that should not be freed.
bool updateWbCounter(int kern_id, int val=-1)
    Update the counter of outstanding wb requests for the kernel.
    kern_id: kernel id; val: +1/-1, ...
statistics::Distribution rubyNetworkLatency
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Derived &name(const std::string &name)
    Set the name and mark this stat to print at the end of simulation.
bool done() const
    Are we done? That is, did the last call to next() advance past the end
    of the region?
Distribution &init(Counter min, Counter max, Counter bkt)
    Set the parameters of this distribution.
virtual std::string name() const
#define ADD_STAT(n, ...)
    Convenience macro to add a stat to a statistics group.
Packet
    A Packet is used to encapsulate a transfer between two objects in the
    memory system (e....
ProbePointArg<PacketInfo> Packet
    Packet probe point.
uint64_t Tick
    Tick count type.
gem5::Shader::ShaderStats stats
std::shared_ptr<Request> RequestPtr
void reschedule(Event &event, Tick when, bool always=false)
statistics::Distribution allLatencyDist
bool dispatchWorkgroups(HSAQueueEntry *task)
static constexpr T roundDown(const T &val, const U &align)
    This function is used to align addresses in memory.
bool dispComplete() const
statistics::Distribution initToCoalesceLatency
void prepareInvalidate(HSAQueueEntry *task)
statistics::Distribution storeLatencyDist
uint64_t Addr
    Address type. This will probably be moved somewhere else in the near
    future.
SenderState *senderState
    This packet's sender state.
std::shared_ptr<GPUDynInst> GPUDynInstPtr
ClockedObject
    The ClockedObject class extends the SimObject with a clock and
    accessor functions to relate ticks to ...
virtual Process *getProcessPtr() = 0
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
               MemCmd cmd, bool suppress_func_errors)
#define panic_if(cond, ...)
    Conditional panic macro that checks the supplied condition and only
    panics if the condition is true a...
Derived &desc(const std::string &_desc)
    Set the description and mark this stat to print at the end of
    simulation.
void ScheduleAdd(int *val, Tick when, int x)
GPUDispatcher &dispatcher()
const FlagsType oneline
    Print all values on a single line.
void updateInvCounter(int kern_id, int val=-1)
    Update the counter of outstanding inv requests for the kernel.
    kern_id: kernel id; val: +1/-1, ...
void sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &roundTripTime)
GPUCommandProcessor &gpuCmdProc
static constexpr T roundUp(const T &val, const U &align)
    This function is used to align addresses in memory.
RequestorID vramRequestorId()
    Forward the VRAM requestor ID needed for device memory from the GPU
    device.
virtual bool mmapGrowsDown() const
    Does the mmap region grow upward or downward from mmapEnd? Most
    platforms grow downward, ...
statistics::Distribution loadLatencyDist
Derived &flags(Flags _flags)
    Set the flags and mark this stat to print at the end of simulation.
static const Priority CPU_Tick_Pri
    CPU ticks must come after other associated CPU events (such as
    writebacks).
Derived &init(size_type size)
    Set this vector to have the given size.
void sampleStore(const Tick accessTime)
void setSuppressFuncError()
bool scheduled() const
    Determine if the current event is scheduled.
statistics::Distribution gmToCompleteLatency
statistics::Distribution coalsrLineAddresses