Go to the documentation of this file.
41 #include "debug/GPUAgentDisp.hh"
42 #include "debug/GPUDisp.hh"
43 #include "debug/GPUMem.hh"
44 #include "debug/GPUShader.hh"
45 #include "debug/GPUWgLatency.hh"
59 _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
60 gpuTc(nullptr), cpuPointer(
p.cpu_pointer),
63 timingSim(
p.timing), hsail_mode(SIMT),
64 impl_kern_launch_acq(
p.impl_kern_launch_acq),
65 impl_kern_end_rel(
p.impl_kern_end_rel),
67 trace_vgpr_all(1), n_cu((
p.CUs).size()), n_wf(
p.n_wf),
68 globalMemSize(
p.globalmem),
69 nextSchedCu(0), sa_n(0), gpuCmdProc(*
p.gpu_cmd_proc),
70 _dispatcher(*
p.dispatcher),
71 max_valu_insts(
p.max_valu_insts), total_valu_insts(0),
72 stats(
this,
p.CUs[0]->wfSize())
74 gpuCmdProc.setShader(
this);
75 _dispatcher.setShader(
this);
77 _gpuVmApe.base = ((
Addr)1 << 61) + 0x1000000000000
L;
78 _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
80 _ldsApe.base = ((
Addr)1 << 61) + 0x0;
81 _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
83 _scratchApe.base = ((
Addr)1 << 61) + 0x100000000
L;
84 _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
86 shHiddenPrivateBaseVmid = 0;
90 panic_if(n_wf <= 0,
"Must have at least 1 WF Slot per SIMD");
92 for (
int i = 0;
i < n_cu; ++
i) {
94 assert(
i == cuList[
i]->cu_id);
95 cuList[
i]->shader =
this;
96 cuList[
i]->idleCUTimeout =
p.idlecu_timeout;
119 DPRINTF(GPUShader,
"GROWS DOWN");
120 start = mem_state->getMmapEnd() - length;
121 mem_state->setMmapEnd(start);
123 DPRINTF(GPUShader,
"GROWS UP");
124 start = mem_state->getMmapEnd();
125 mem_state->setMmapEnd(start + length);
128 assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
129 mem_state->getMmapEnd());
132 DPRINTF(GPUShader,
"Shader::mmap start= %#x, %#x\n", start, length);
168 for (
int i = 0;
i <
sa_n; ++
i) {
180 Tick shader_wakeup = *std::max_element(
sa_when.begin(),
182 DPRINTF(GPUDisp,
"Scheduling shader wakeup at %lu\n", shader_wakeup);
185 DPRINTF(GPUDisp,
"sa_when empty, shader going to sleep!\n");
204 for (
int i_cu = 0; i_cu <
n_cu; ++i_cu) {
207 auto req = std::make_shared<Request>(0, 0, 0,
208 cuList[i_cu]->requestorId(),
216 cuList[i_cu]->resetRegisterPool();
225 int kernId = gpuDynInst->kern_id;
233 cuList[i_cu]->doFlush(gpuDynInst);
239 bool scheduledSomething =
false;
244 while (cuCount <
n_cu) {
251 int num_wfs_in_wg = 0;
252 bool can_disp =
cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
254 scheduledSomething =
true;
255 DPRINTF(GPUDisp,
"Dispatching a workgroup to CU %d: WG %d\n",
257 DPRINTF(GPUAgentDisp,
"Dispatching a workgroup to CU %d: WG %d\n",
259 DPRINTF(GPUWgLatency,
"WG Begin cycle:%d wg:%d cu:%d\n",
269 "Invalid activeCu size\n");
270 cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
280 DPRINTF(GPUWgLatency,
"Shader Dispatched %d Wgs\n", disp_count);
282 return scheduledSomething;
287 bool suppress_func_errors,
int cu_id)
289 int block_size =
cuList.at(cu_id)->cacheLineSize();
290 unsigned size = req->getSize();
300 fatal(
"unexcepted MemCmd\n");
303 tmp_addr = req->getVaddr();
304 Addr split_addr =
roundDown(tmp_addr + size - 1, block_size);
306 assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
309 if (split_addr > tmp_addr) {
311 req->splitOnVaddr(split_addr, req1, req2);
325 if (suppress_func_errors) {
332 cuList[0]->memPort[0].sendFunctional(new_pkt1);
333 cuList[0]->memPort[0].sendFunctional(new_pkt2);
345 if (suppress_func_errors) {
351 cuList[0]->memPort[0].sendFunctional(new_pkt);
367 DPRINTF(GPUDisp,
"New scheduled add; scheduling shader wakeup at "
372 DPRINTF(GPUDisp,
"New scheduled add; wakeup already scheduled at "
379 MemCmd cmd,
bool suppress_func_errors)
381 uint8_t *data_buf = (uint8_t*)ptr;
384 !gen.
done(); gen.next()) {
387 gen.addr(), gen.size(), 0,
388 cuList[0]->requestorId(), 0, 0,
nullptr);
391 data_buf += gen.size();
403 bool suppress_func_errors)
406 suppress_func_errors);
417 bool suppress_func_errors)
420 suppress_func_errors);
433 new TheISA::GpuTLB::TranslationState(
mode,
gpuTc,
false);
439 cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
442 TheISA::GpuTLB::TranslationState *sender_state =
443 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->
senderState);
445 delete sender_state->tlbEntry;
477 Tick t1 = roundTripTime[0];
478 Tick t2 = roundTripTime[1];
479 Tick t3 = roundTripTime[2];
480 Tick t4 = roundTripTime[3];
481 Tick t5 = roundTripTime[4];
497 for (
auto& it : lineMap) {
499 if (timeVec.size() == 2) {
500 netTimes.push_back(timeVec[1] - timeVec[0]);
507 std::sort(netTimes.begin(), netTimes.end());
512 for (
auto& time : netTimes) {
522 "Invalid activeCu size\n");
529 : statistics::
Group(parent),
530 ADD_STAT(allLatencyDist,
"delay distribution for all"),
531 ADD_STAT(loadLatencyDist,
"delay distribution for loads"),
532 ADD_STAT(storeLatencyDist,
"delay distribution for stores"),
534 "Ticks from vmem inst initiateAcc to coalescer issue"),
536 "Ticks from coalescer issue to coalescer hit callback"),
538 "Ticks from coalescer hit callback to GM pipe enqueue"),
540 "Ticks queued in GM pipes ordered response buffer"),
542 "Number of cache lines for coalesced request"),
544 "Total ticks that any CU attached to this shader is active"),
546 "vector instruction source operand distribution"),
548 "vector instruction destination operand distribution")
551 .
init(0, 1600000, 10000)
555 .
init(0, 1600000, 10000)
559 .
init(0, 1600000, 10000)
563 .
init(0, 1600000, 10000)
567 .
init(0, 1600000, 10000)
571 .
init(0, 1600000, 10000)
575 .
init(0, 1600000, 10000)
586 for (
int idx = 0; idx < wf_size; ++idx) {
587 std::stringstream namestr;
588 ccprintf(namestr,
"%s.cacheBlockRoundTrip%d",
591 .
init(0, 1600000, 10000)
593 .
desc(
"Coalsr-to-coalsr time for the Nth cache block in an inst")
Tick curTick()
The universal simulation clock.
#define fatal(...)
This implements a cprintf based fatal() function.
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
ShaderStats(statistics::Group *parent, int wf_size)
std::vector< uint64_t > sa_when
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Tick when() const
Get the time that the event is scheduled.
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
std::vector< int32_t > sa_x
A simple distribution stat.
void sampleLoad(const Tick accessTime)
statistics::Vector vectorInstDstOperand
GPUDispatcher & _dispatcher
statistics::Distribution gmEnqueueLatency
std::vector< int * > sa_val
RequestPtr req
A pointer to the original request.
void updateContext(int cid)
std::vector< ComputeUnit * > cuList
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
void schedule(Event &event, Tick when)
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
int getOutstandingWbs(int kern_id)
get kernel's outstanding cache writeback requests
statistics::Vector vectorInstSrcOperand
EventFunctionWrapper tickEvent
statistics::Distribution * cacheBlockRoundTrip
void sample(const U &v, int n=1)
Add a value to the distribution n times.
void ccprintf(cp::Print &print)
statistics::Scalar shaderActiveTicks
std::shared_ptr< MemState > memState
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
const FlagsType pdf
Print the percent of the total that this entry represents.
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
statistics::Distribution rubyNetworkLatency
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
virtual std::string name() const
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
ProbePointArg< PacketInfo > Packet
Packet probe point.
uint64_t Tick
Tick count type.
gem5::Shader::ShaderStats stats
std::shared_ptr< Request > RequestPtr
void reschedule(Event &event, Tick when, bool always=false)
statistics::Distribution allLatencyDist
bool dispatchWorkgroups(HSAQueueEntry *task)
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
bool dispComplete() const
statistics::Distribution initToCoalesceLatency
virtual ThreadContext * getContext(int tn)
Given a thread num get the thread context for it.
void prepareInvalidate(HSAQueueEntry *task)
statistics::Distribution storeLatencyDist
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
SenderState * senderState
This packet's sender state.
std::shared_ptr< GPUDynInst > GPUDynInstPtr
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
virtual Process * getProcessPtr()=0
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
void ScheduleAdd(int *val, Tick when, int x)
GPUDispatcher & dispatcher()
const FlagsType oneline
Print all values on a single line.
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
static constexpr T roundUp(const T &val, const U &align)
This function is used to align addresses in memory.
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward,...
statistics::Distribution loadLatencyDist
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Derived & init(size_type size)
Set this vector to have the given size.
void sampleStore(const Tick accessTime)
void setSuppressFuncError()
bool scheduled() const
Determine if the current event is scheduled.
statistics::Distribution gmToCompleteLatency
statistics::Distribution coalsrLineAddresses
Generated on Tue Sep 7 2021 14:53:47 for gem5 by doxygen 1.8.17