Go to the documentation of this file.
41 #include "debug/GPUDisp.hh"
42 #include "debug/GPUMem.hh"
43 #include "debug/GPUShader.hh"
44 #include "debug/GPUWgLatency.hh"
55 _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
56 gpuTc(nullptr), cpuPointer(
p->cpu_pointer),
59 timingSim(
p->timing), hsail_mode(SIMT),
60 impl_kern_launch_acq(
p->impl_kern_launch_acq),
61 impl_kern_end_rel(
p->impl_kern_end_rel),
63 trace_vgpr_all(1), n_cu((
p->CUs).size()), n_wf(
p->n_wf),
64 globalMemSize(
p->globalmem),
65 nextSchedCu(0), sa_n(0), gpuCmdProc(*
p->gpu_cmd_proc),
66 _dispatcher(*
p->dispatcher),
67 max_valu_insts(
p->max_valu_insts), total_valu_insts(0)
69 gpuCmdProc.setShader(
this);
70 _dispatcher.setShader(
this);
72 _gpuVmApe.base = ((
Addr)1 << 61) + 0x1000000000000
L;
73 _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
75 _ldsApe.base = ((
Addr)1 << 61) + 0x0;
76 _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
78 _scratchApe.base = ((
Addr)1 << 61) + 0x100000000
L;
79 _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
81 shHiddenPrivateBaseVmid = 0;
85 panic_if(n_wf <= 0,
"Must have at least 1 WF Slot per SIMD");
87 for (
int i = 0;
i < n_cu; ++
i) {
88 cuList[
i] =
p->CUs[
i];
89 assert(
i == cuList[
i]->cu_id);
90 cuList[
i]->shader =
this;
91 cuList[
i]->idleCUTimeout =
p->idlecu_timeout;
114 DPRINTF(GPUShader,
"GROWS DOWN");
115 start = mem_state->getMmapEnd() -
length;
116 mem_state->setMmapEnd(start);
118 DPRINTF(GPUShader,
"GROWS UP");
119 start = mem_state->getMmapEnd();
120 mem_state->setMmapEnd(start +
length);
123 assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
124 mem_state->getMmapEnd());
127 DPRINTF(GPUShader,
"Shader::mmap start= %#x, %#x\n", start,
length);
158 ShaderParams::create()
169 for (
int i = 0;
i <
sa_n; ++
i) {
181 Tick shader_wakeup = *std::max_element(
sa_when.begin(),
183 DPRINTF(GPUDisp,
"Scheduling shader wakeup at %lu\n", shader_wakeup);
186 DPRINTF(GPUDisp,
"sa_when empty, shader going to sleep!\n");
205 for (
int i_cu = 0; i_cu <
n_cu; ++i_cu) {
208 auto req = std::make_shared<Request>(0, 0, 0,
209 cuList[i_cu]->requestorId(),
223 int kernId = gpuDynInst->kern_id;
231 cuList[i_cu]->doFlush(gpuDynInst);
237 bool scheduledSomething =
false;
241 while (cuCount <
n_cu) {
248 int num_wfs_in_wg = 0;
249 bool can_disp =
cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
251 scheduledSomething =
true;
252 DPRINTF(GPUDisp,
"Dispatching a workgroup to CU %d: WG %d\n",
254 DPRINTF(GPUWgLatency,
"WG Begin cycle:%d wg:%d cu:%d\n",
264 "Invalid activeCu size\n");
265 cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
274 return scheduledSomething;
283 .
name(
name() +
".shader_active_ticks")
284 .
desc(
"Total ticks that any CU attached to this shader is active")
287 .
init(0, 1600000, 10000)
289 .
desc(
"delay distribution for all")
293 .
init(0, 1600000, 10000)
295 .
desc(
"delay distribution for loads")
299 .
init(0, 1600000, 10000)
301 .
desc(
"delay distribution for stores")
306 .
name(
name() +
".vec_inst_src_operand")
307 .
desc(
"vector instruction source operand distribution");
311 .
name(
name() +
".vec_inst_dst_operand")
312 .
desc(
"vector instruction destination operand distribution");
315 .
init(0, 1600000, 10000)
316 .
name(
name() +
".initToCoalesceLatency")
317 .
desc(
"Ticks from vmem inst initiateAcc to coalescer issue")
321 .
init(0, 1600000, 10000)
322 .
name(
name() +
".rubyNetworkLatency")
323 .
desc(
"Ticks from coalescer issue to coalescer hit callback")
327 .
init(0, 1600000, 10000)
329 .
desc(
"Ticks from coalescer hit callback to GM pipe enqueue")
333 .
init(0, 1600000, 10000)
334 .
name(
name() +
".gmToCompleteLatency")
335 .
desc(
"Ticks queued in GM pipes ordered response buffer")
340 .
name(
name() +
".coalsrLineAddresses")
341 .
desc(
"Number of cache lines for coalesced request")
344 int wfSize =
cuList[0]->wfSize();
346 for (
int idx = 0; idx < wfSize; ++idx) {
347 std::stringstream namestr;
348 ccprintf(namestr,
"%s.cacheBlockRoundTrip%d",
name(), idx);
350 .
init(0, 1600000, 10000)
352 .
desc(
"Coalsr-to-coalsr time for the Nth cache block in an inst")
359 bool suppress_func_errors,
int cu_id)
361 int block_size =
cuList.at(cu_id)->cacheLineSize();
362 unsigned size = req->getSize();
372 fatal(
"unexcepted MemCmd\n");
375 tmp_addr = req->getVaddr();
376 Addr split_addr =
roundDown(tmp_addr + size - 1, block_size);
378 assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
381 if (split_addr > tmp_addr) {
383 req->splitOnVaddr(split_addr, req1, req2);
397 if (suppress_func_errors) {
404 cuList[0]->memPort[0].sendFunctional(new_pkt1);
405 cuList[0]->memPort[0].sendFunctional(new_pkt2);
417 if (suppress_func_errors) {
423 cuList[0]->memPort[0].sendFunctional(new_pkt);
439 DPRINTF(GPUDisp,
"New scheduled add; scheduling shader wakeup at "
444 DPRINTF(GPUDisp,
"New scheduled add; wakeup already scheduled at "
451 MemCmd cmd,
bool suppress_func_errors)
453 uint8_t *data_buf = (uint8_t*)ptr;
456 !gen.
done(); gen.next()) {
459 gen.addr(), gen.size(), 0,
460 cuList[0]->requestorId(), 0, 0,
nullptr);
463 data_buf += gen.size();
475 bool suppress_func_errors)
478 suppress_func_errors);
489 bool suppress_func_errors)
492 suppress_func_errors);
505 new TheISA::GpuTLB::TranslationState(
mode,
gpuTc,
false);
511 cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
514 TheISA::GpuTLB::TranslationState *sender_state =
515 safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->
senderState);
517 delete sender_state->tlbEntry;
549 Tick t1 = roundTripTime[0];
550 Tick t2 = roundTripTime[1];
551 Tick t3 = roundTripTime[2];
552 Tick t4 = roundTripTime[3];
553 Tick t5 = roundTripTime[4];
569 for (
auto& it : lineMap) {
571 if (timeVec.size() == 2) {
572 netTimes.push_back(timeVec[1] - timeVec[0]);
579 std::sort(netTimes.begin(), netTimes.end());
584 for (
auto& time : netTimes) {
594 "Invalid activeCu size\n");
#define fatal(...)
This implements a cprintf based fatal() function.
virtual void regStats()
Callback to set stat parameters.
Stats::Vector vectorInstDstOperand
void setSuppressFuncError()
bool scheduled() const
Determine if the current event is scheduled.
std::vector< int32_t > sa_x
T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
void updateContext(int cid)
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Stats::Distribution gmToCompleteLatency
EventFunctionWrapper tickEvent
void reschedule(Event &event, Tick when, bool always=false)
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
uint64_t Tick
Tick count type.
Stats::Distribution allLatencyDist
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
std::shared_ptr< Request > RequestPtr
RequestPtr req
A pointer to the original request.
Stats::Distribution gmEnqueueLatency
void prepareInvalidate(HSAQueueEntry *task)
virtual Process * getProcessPtr()=0
virtual ThreadContext * getContext(int tn)
Given a thread num, get the thread context for it.
Tick when() const
Get the time that the event is scheduled.
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
Stats::Distribution storeLatencyDist
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
GPUDispatcher & dispatcher()
Stats::Distribution initToCoalesceLatency
void schedule(Event &event, Tick when)
void regStats()
Callback to set stat parameters.
Stats::Vector vectorInstSrcOperand
const FlagsType oneline
Print all values on a single line.
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
std::vector< int * > sa_val
Stats::Scalar shaderActiveTicks
Statistics.
ProbePointArg< PacketInfo > Packet
Packet probe point.
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
A simple distribution stat.
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
Derived & init(size_type size)
Set this vector to have the given size.
virtual const std::string name() const
Stats::Distribution * cacheBlockRoundTrip
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward,...
std::vector< uint64_t > sa_when
void ScheduleAdd(int *val, Tick when, int x)
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
T roundUp(const T &val, const U &align)
This function is used to align addresses in memory.
const FlagsType pdf
Print the percent of the total that this entry represents.
int getOutstandingWbs(int kern_id)
get kernel's outstanding cache writeback requests
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Stats::Distribution loadLatencyDist
void ccprintf(cp::Print &print)
std::shared_ptr< GPUDynInst > GPUDynInstPtr
void sample(const U &v, int n=1)
Add a value to the distribution n times.
Stats::Distribution coalsrLineAddresses
bool dispComplete() const
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
SenderState * senderState
This packet's sender state.
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Stats::Distribution rubyNetworkLatency
void sampleLoad(const Tick accessTime)
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
bool dispatchWorkgroups(HSAQueueEntry *task)
GPUDispatcher & _dispatcher
void sampleStore(const Tick accessTime)
std::shared_ptr< MemState > memState
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
std::vector< ComputeUnit * > cuList
Tick curTick()
The current simulated tick.
Generated on Wed Sep 30 2020 14:02:12 for gem5 by doxygen 1.8.17