Go to the documentation of this file.
41 #include "debug/GPUAgentDisp.hh"
42 #include "debug/GPUDisp.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUShader.hh"
45 #include "debug/GPUWgLatency.hh"
56 _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
57 gpuTc(nullptr), cpuPointer(
p.cpu_pointer),
60 timingSim(
p.timing), hsail_mode(SIMT),
61 impl_kern_launch_acq(
p.impl_kern_launch_acq),
62 impl_kern_end_rel(
p.impl_kern_end_rel),
64 trace_vgpr_all(1), n_cu((
p.CUs).size()), n_wf(
p.n_wf),
65 globalMemSize(
p.globalmem),
66 nextSchedCu(0), sa_n(0), gpuCmdProc(*
p.gpu_cmd_proc),
67 _dispatcher(*
p.dispatcher),
68 max_valu_insts(
p.max_valu_insts), total_valu_insts(0),
69 stats(
this,
p.CUs[0]->wfSize())
71 gpuCmdProc.setShader(
this);
72 _dispatcher.setShader(
this);
74 _gpuVmApe.base = ((
Addr)1 << 61) + 0x1000000000000
L;
75 _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
77 _ldsApe.base = ((
Addr)1 << 61) + 0x0;
78 _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
80 _scratchApe.base = ((
Addr)1 << 61) + 0x100000000
L;
81 _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
83 shHiddenPrivateBaseVmid = 0;
87 panic_if(n_wf <= 0,
"Must have at least 1 WF Slot per SIMD");
89 for (
int i = 0;
i < n_cu; ++
i) {
91 assert(
i == cuList[
i]->cu_id);
92 cuList[
i]->shader =
this;
93 cuList[
i]->idleCUTimeout =
p.idlecu_timeout;
116 DPRINTF(GPUShader,
"GROWS DOWN");
117 start = mem_state->getMmapEnd() - length;
118 mem_state->setMmapEnd(start);
120 DPRINTF(GPUShader,
"GROWS UP");
121 start = mem_state->getMmapEnd();
122 mem_state->setMmapEnd(start + length);
125 assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
126 mem_state->getMmapEnd());
129 DPRINTF(GPUShader,
"Shader::mmap start= %#x, %#x\n", start, length);
165 for (
int i = 0;
i <
sa_n; ++
i) {
177 Tick shader_wakeup = *std::max_element(
sa_when.begin(),
179 DPRINTF(GPUDisp,
"Scheduling shader wakeup at %lu\n", shader_wakeup);
182 DPRINTF(GPUDisp,
"sa_when empty, shader going to sleep!\n");
201 for (
int i_cu = 0; i_cu <
n_cu; ++i_cu) {
204 auto req = std::make_shared<Request>(0, 0, 0,
205 cuList[i_cu]->requestorId(),
213 cuList[i_cu]->resetRegisterPool();
222 int kernId = gpuDynInst->kern_id;
230 cuList[i_cu]->doFlush(gpuDynInst);
236 bool scheduledSomething =
false;
241 while (cuCount <
n_cu) {
248 int num_wfs_in_wg = 0;
249 bool can_disp =
cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
251 scheduledSomething =
true;
252 DPRINTF(GPUDisp,
"Dispatching a workgroup to CU %d: WG %d\n",
254 DPRINTF(GPUAgentDisp,
"Dispatching a workgroup to CU %d: WG %d\n",
256 DPRINTF(GPUWgLatency,
"WG Begin cycle:%d wg:%d cu:%d\n",
266 "Invalid activeCu size\n");
267 cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
277 DPRINTF(GPUWgLatency,
"Shader Dispatched %d Wgs\n", disp_count);
279 return scheduledSomething;
284 bool suppress_func_errors,
int cu_id)
286 int block_size =
cuList.at(cu_id)->cacheLineSize();
287 unsigned size = req->getSize();
297 fatal(
"unexcepted MemCmd\n");
300 tmp_addr = req->getVaddr();
301 Addr split_addr =
roundDown(tmp_addr + size - 1, block_size);
303 assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
306 if (split_addr > tmp_addr) {
308 req->splitOnVaddr(split_addr, req1, req2);
322 if (suppress_func_errors) {
329 cuList[0]->memPort[0].sendFunctional(new_pkt1);
330 cuList[0]->memPort[0].sendFunctional(new_pkt2);
342 if (suppress_func_errors) {
348 cuList[0]->memPort[0].sendFunctional(new_pkt);
364 DPRINTF(GPUDisp,
"New scheduled add; scheduling shader wakeup at "
369 DPRINTF(GPUDisp,
"New scheduled add; wakeup already scheduled at "
376 MemCmd cmd,
bool suppress_func_errors)
378 uint8_t *data_buf = (uint8_t*)ptr;
381 !gen.
done(); gen.next()) {
384 gen.addr(), gen.size(), 0,
385 cuList[0]->requestorId(), 0, 0,
nullptr);
388 data_buf += gen.size();
400 bool suppress_func_errors)
403 suppress_func_errors);
414 bool suppress_func_errors)
417 suppress_func_errors);
430 new TheISA::GpuTLB::TranslationState(
mode,
gpuTc,
false);
436 cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
439 TheISA::GpuTLB::TranslationState *sender_state =
440 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->
senderState);
442 delete sender_state->tlbEntry;
474 Tick t1 = roundTripTime[0];
475 Tick t2 = roundTripTime[1];
476 Tick t3 = roundTripTime[2];
477 Tick t4 = roundTripTime[3];
478 Tick t5 = roundTripTime[4];
494 for (
auto& it : lineMap) {
496 if (timeVec.size() == 2) {
497 netTimes.push_back(timeVec[1] - timeVec[0]);
504 std::sort(netTimes.begin(), netTimes.end());
509 for (
auto& time : netTimes) {
519 "Invalid activeCu size\n");
526 :
Stats::Group(parent),
527 ADD_STAT(allLatencyDist,
"delay distribution for all"),
528 ADD_STAT(loadLatencyDist,
"delay distribution for loads"),
529 ADD_STAT(storeLatencyDist,
"delay distribution for stores"),
531 "Ticks from vmem inst initiateAcc to coalescer issue"),
533 "Ticks from coalescer issue to coalescer hit callback"),
535 "Ticks from coalescer hit callback to GM pipe enqueue"),
537 "Ticks queued in GM pipes ordered response buffer"),
539 "Number of cache lines for coalesced request"),
541 "Total ticks that any CU attached to this shader is active"),
543 "vector instruction source operand distribution"),
545 "vector instruction destination operand distribution")
548 .
init(0, 1600000, 10000)
552 .
init(0, 1600000, 10000)
556 .
init(0, 1600000, 10000)
560 .
init(0, 1600000, 10000)
564 .
init(0, 1600000, 10000)
568 .
init(0, 1600000, 10000)
572 .
init(0, 1600000, 10000)
583 for (
int idx = 0; idx < wf_size; ++idx) {
584 std::stringstream namestr;
585 ccprintf(namestr,
"%s.cacheBlockRoundTrip%d",
588 .
init(0, 1600000, 10000)
590 .
desc(
"Coalsr-to-coalsr time for the Nth cache block in an inst")
#define fatal(...)
This implements a cprintf based fatal() function.
void setSuppressFuncError()
bool scheduled() const
Determine if the current event is scheduled.
std::vector< int32_t > sa_x
T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Shader::ShaderStats stats
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
void updateContext(int cid)
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Stats::Distribution initToCoalesceLatency
EventFunctionWrapper tickEvent
void reschedule(Event &event, Tick when, bool always=false)
Stats::Vector vectorInstDstOperand
Stats::Distribution rubyNetworkLatency
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
uint64_t Tick
Tick count type.
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
std::shared_ptr< Request > RequestPtr
RequestPtr req
A pointer to the original request.
void prepareInvalidate(HSAQueueEntry *task)
virtual Process * getProcessPtr()=0
Stats::Distribution loadLatencyDist
Stats::Vector vectorInstSrcOperand
virtual ThreadContext * getContext(int tn)
Given a thread number, get the thread context for it.
Tick when() const
Get the time that the event is scheduled.
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
Stats::Scalar shaderActiveTicks
GPUDispatcher & dispatcher()
void schedule(Event &event, Tick when)
const FlagsType oneline
Print all values on a single line.
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Stats::Distribution coalsrLineAddresses
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
std::vector< int * > sa_val
Stats::Distribution allLatencyDist
Stats::Distribution * cacheBlockRoundTrip
ProbePointArg< PacketInfo > Packet
Packet probe point.
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
A simple distribution stat.
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
Derived & init(size_type size)
Set this vector to have the given size.
virtual const std::string name() const
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward,...
std::vector< uint64_t > sa_when
void ScheduleAdd(int *val, Tick when, int x)
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
ShaderStats(Stats::Group *parent, int wf_size)
T roundUp(const T &val, const U &align)
This function is used to align addresses in memory.
const FlagsType pdf
Print the percent of the total that this entry represents.
int getOutstandingWbs(int kern_id)
get kernel's outstanding cache writeback requests
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void ccprintf(cp::Print &print)
std::shared_ptr< GPUDynInst > GPUDynInstPtr
void sample(const U &v, int n=1)
Add a value to the distribution n times.
bool dispComplete() const
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
Tick curTick()
The universal simulation clock.
SenderState * senderState
This packet's sender state.
Stats::Distribution gmEnqueueLatency
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
void sampleLoad(const Tick accessTime)
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
bool dispatchWorkgroups(HSAQueueEntry *task)
Stats::Distribution gmToCompleteLatency
GPUDispatcher & _dispatcher
void sampleStore(const Tick accessTime)
std::shared_ptr< MemState > memState
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
std::vector< ComputeUnit * > cuList
Stats::Distribution storeLatencyDist
Generated on Tue Mar 23 2021 19:41:27 for gem5 by doxygen 1.8.17