#include "debug/GPUAgentDisp.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
// Shader constructor: member-initializer list and setup (excerpt; `p` is
// the shader's parameter object).
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p.cpu_pointer),
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    stats(this, p.CUs[0]->wfSize())
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);
    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = ((Addr)1 << 61) + 0x0;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    shHiddenPrivateBaseVmid = 0;
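// --- Illustrative sketch (not part of shader.cc) ---------------------------
// Each aperture limit above keeps the high-order bits of its base and fills
// the low-order bits with ones, so base and limit bound an aligned window.
// A minimal standalone program (hypothetical, for illustration only) that
// reproduces the LDS aperture arithmetic:
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main()
{
    // Keep the upper 32 bits of the base and set the lower 32 bits,
    // giving a 4 GiB window, as in the _ldsApe code above.
    uint64_t base  = (uint64_t{1} << 61) + 0x0;
    uint64_t limit = (base & 0xFFFFFFFF00000000ULL) | 0xFFFFFFFFULL;
    printf("base=%#" PRIx64 " limit=%#" PRIx64 " span=%#" PRIx64 "\n",
           base, limit, limit - base + 1);
    return 0;
}
// ---------------------------------------------------------------------------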
    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    // Hook every compute unit up to this shader.
    for (int i = 0; i < n_cu; ++i) {
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p.idlecu_timeout;
    }
// Shader::mmap (excerpt): carve a region out of the process's mmap area,
// growing it in whichever direction the platform uses.
    if (proc->mmapGrowsDown()) {
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // Make sure we don't run into the downward-growing stack.
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
// Shader wakeup logic (excerpt): after applying any scheduled adds, sleep
// unless more are still pending.
    for (int i = 0; i < sa_n; ++i) {
        // ... (scheduled-add bookkeeping not shown in this listing)
    }

    if (sa_n) {
        Tick shader_wakeup = *std::max_element(sa_when.begin(),
                                               sa_when.end());
        DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
        schedule(tickEvent, shader_wakeup);
    } else {
        DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    }
// Shader::prepareInvalidate (excerpt): ask every CU to invalidate its
// caches at kernel launch.
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // Create a request to carry the invalidate; the CU fills in its
        // fields before use. (Trailing constructor arguments are not
        // shown in this listing.)
        auto req = std::make_shared<Request>(0, 0, 0,
                                             cuList[i_cu]->requestorId(),
                                             /* ... */);
        // ...
        cuList[i_cu]->resetRegisterPool();
    }

// Shader::prepareFlush (excerpt): ask every CU to flush its caches at
// kernel end.
    int kernId = gpuDynInst->kern_id;
    // ...
        cuList[i_cu]->doFlush(gpuDynInst);
// Shader::dispatchWorkgroups (excerpt): walk the CUs round-robin and hand
// the next workgroup to the first CU with enough free dispatch resources.
    bool scheduledSomething = false;

    while (cuCount < n_cu) {
        // ...
        int num_wfs_in_wg = 0;
        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
        if (!task->dispComplete() && can_disp) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    curTick(), task->globalWgId(), curCu);
            // ...
            panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
                     "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
            // ...
        }
        // ...
    }

    DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);

    return scheduledSomething;
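// --- Illustrative sketch (not part of shader.cc) ---------------------------
// The loop above tries each CU at most once per call, starting from
// nextSchedCu so that successive dispatches rotate around the CU list. A
// standalone sketch of that round-robin walk (hypothetical Cu type and
// pickUnit helper, for illustration only):
#include <cstdio>
#include <vector>

struct Cu { int freeSlots; };   // hypothetical stand-in for a compute unit

// Try each unit once, starting at nextSched, and return the index of the
// first unit with enough free slots (or -1). nextSched advances on every
// attempt, mirroring how nextSchedCu is updated above.
int pickUnit(std::vector<Cu> &cus, int &nextSched, int slotsNeeded)
{
    int cur = nextSched;
    for (size_t tried = 0; tried < cus.size(); ++tried) {
        nextSched = (nextSched + 1) % cus.size();
        if (cus[cur].freeSlots >= slotsNeeded)
            return cur;
        cur = nextSched;
    }
    return -1;
}

int main()
{
    std::vector<Cu> cus{{0}, {2}, {4}};
    int next = 0;
    printf("picked CU %d\n", pickUnit(cus, next, 2));  // prints: picked CU 1
    return 0;
}
// ---------------------------------------------------------------------------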
// Shader::doFunctionalAccess: functionally access memory on behalf of the
// CPU, splitting any request that straddles a cache-line boundary.
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();
    // ... (the translation mode is selected from cmd; anything other than
    // a read or write request is fatal)
        fatal("unexpected MemCmd\n");
    // ...
    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    if (split_addr > tmp_addr) {
        // The access crosses a line boundary: split it into two requests.
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);
        // ...
        if (suppress_func_errors) {
            // ... (mark both packets to suppress functional errors)
        }
        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);
    } else {
        // ...
        if (suppress_func_errors) {
            // ... (mark the packet to suppress functional errors)
        }
        cuList[0]->memPort[0].sendFunctional(new_pkt);
    }
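// --- Illustrative sketch (not part of shader.cc) ---------------------------
// roundDown(tmp_addr + size - 1, block_size) is the line-aligned address of
// the access's last byte; if that lies above tmp_addr, the access spans two
// cache lines. A standalone check of the arithmetic (hypothetical values;
// this roundDown assumes a power-of-two alignment):
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t roundDown(uint64_t val, uint64_t align)
{
    return val & ~(align - 1);   // align must be a power of two
}

int main()
{
    const uint64_t block = 64;                  // cache-line size
    uint64_t addr = 0x1038, size = 16;          // bytes 0x1038..0x1047
    uint64_t split = roundDown(addr + size - 1, block);
    assert(split > addr);                       // straddles 0x1040: split it
    printf("first chunk: %llu bytes, second chunk: %llu bytes\n",
           (unsigned long long)(split - addr),
           (unsigned long long)(addr + size - split));
    return 0;
}
// ---------------------------------------------------------------------------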
// Shader::ScheduleAdd (excerpt): pull the wakeup earlier if the new
// deferred add is due before the currently scheduled tick event.
    if (!tickEvent.scheduled() || (when < tickEvent.when())) {
        DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
                "%lu\n", when);
        reschedule(tickEvent, when, true);
    } else {
        DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
                "%lu\n", when);
    }
// Shader::AccessMem: walk the region one cache-line-sized chunk at a time
// and issue a functional access for each chunk.
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {
        RequestPtr req = std::make_shared<Request>(
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
    }
}
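// --- Illustrative sketch (not part of shader.cc) ---------------------------
// A ChunkGenerator splits an arbitrary (address, length) region at alignment
// boundaries so that no chunk crosses a line. A minimal standalone equivalent
// of that iteration (hypothetical, not the gem5 class):
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const uint64_t line = 64;                 // chunk alignment
    uint64_t addr = 0x1030, remaining = 100;  // region to cover
    while (remaining > 0) {
        // This chunk runs to the next line boundary, capped by what's left.
        uint64_t chunk = std::min(remaining, line - (addr % line));
        printf("chunk at %#llx, %llu bytes\n",
               (unsigned long long)addr, (unsigned long long)chunk);
        addr += chunk;
        remaining -= chunk;
    }
    return 0;
}
// ---------------------------------------------------------------------------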
// Shader::ReadMem / Shader::WriteMem: thin wrappers that forward to
// AccessMem with the matching memory command.
void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}
// Shader::functionalTLBAccess (excerpt): push the translation through the
// CU's TLB port functionally (i.e., in zero simulated time).
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
// Shader::ShaderStats::sampleInstRoundTrip (excerpt): unpack the timestamps
// an instruction collected through the memory system and sample the
// corresponding interval distributions.
    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];

    initToCoalesceLatency.sample(t2 - t1);
    rubyNetworkLatency.sample(t3 - t2);
    gmEnqueueLatency.sample(t4 - t3);
    gmToCompleteLatency.sample(t5 - t4);
// Shader::ShaderStats::sampleLineRoundTrip (excerpt): record the
// coalescer-to-coalescer round trip of every cache line an instruction
// touched, ordered fastest to slowest.
    for (auto& it : lineMap) {
        const std::vector<Tick> &timeVec = it.second;
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        }
    }

    // Sort so the Nth-fastest line is sampled into the Nth distribution.
    std::sort(netTimes.begin(), netTimes.end());

    int idx = 0;
    for (auto& time : netTimes) {
        cacheBlockRoundTrip[idx].sample(time);
        ++idx;
    }
519 "Invalid activeCu size\n");
Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
    : statistics::Group(parent),
      ADD_STAT(allLatencyDist, "delay distribution for all"),
      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
      ADD_STAT(initToCoalesceLatency,
               "Ticks from vmem inst initiateAcc to coalescer issue"),
      ADD_STAT(rubyNetworkLatency,
               "Ticks from coalescer issue to coalescer hit callback"),
      ADD_STAT(gmEnqueueLatency,
               "Ticks from coalescer hit callback to GM pipe enqueue"),
      ADD_STAT(gmToCompleteLatency,
               "Ticks queued in GM pipes ordered response buffer"),
      ADD_STAT(coalsrLineAddresses,
               "Number of cache lines for coalesced request"),
      ADD_STAT(shaderActiveTicks,
               "Total ticks that any CU attached to this shader is active"),
      ADD_STAT(vectorInstSrcOperand,
               "vector instruction source operand distribution"),
      ADD_STAT(vectorInstDstOperand,
               "vector instruction destination operand distribution")
{
    // Every latency distribution uses the same parameters: 10,000-tick
    // buckets spanning 0 to 1,600,000 ticks.
    allLatencyDist.init(0, 1600000, 10000);
    loadLatencyDist.init(0, 1600000, 10000);
    storeLatencyDist.init(0, 1600000, 10000);
    initToCoalesceLatency.init(0, 1600000, 10000);
    rubyNetworkLatency.init(0, 1600000, 10000);
    gmEnqueueLatency.init(0, 1600000, 10000);
    gmToCompleteLatency.init(0, 1600000, 10000);
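// --- Illustrative sketch (not part of shader.cc) ---------------------------
// Distribution::init(min, max, bkt) configures a histogram whose buckets are
// bkt wide and cover [min, max). A minimal standalone histogram (hypothetical
// Hist type, for illustration only) showing where a sample lands under the
// parameters used above:
#include <cstdio>
#include <vector>

struct Hist {
    long min, bkt;
    std::vector<long> counts;
    Hist(long min_, long max_, long bkt_)
        : min(min_), bkt(bkt_), counts((max_ - min_) / bkt_, 0) {}
    void sample(long v)
    {
        // In-range values go to bucket (v - min) / bkt; gem5's Distribution
        // additionally tracks underflow/overflow, which this sketch drops.
        if (v >= min && v < min + (long)counts.size() * bkt)
            ++counts[(v - min) / bkt];
    }
};

int main()
{
    Hist h(0, 1600000, 10000);   // same parameters as the stats above
    h.sample(12345);             // lands in bucket 1 (ticks 10000..19999)
    printf("buckets=%zu count[1]=%ld\n", h.counts.size(), h.counts[1]);
    return 0;
}
// ---------------------------------------------------------------------------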
    // One distribution per wavefront lane, named <shader>.cacheBlockRoundTripN.
    for (int idx = 0; idx < wf_size; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
                 static_cast<Shader*>(parent)->name(), idx);
        cacheBlockRoundTrip[idx]
            .init(0, 1600000, 10000)
            .name(namestr.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst");
    }