gem5 v24.1.0.1
shader.cc
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/amdgpu/common/gpu_translation_state.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUAgentDisp.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "mem/packet.hh"
#include "sim/sim_exit.hh"

namespace gem5
{

Shader::Shader(const Params &p) : ClockedObject(p),
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p.cpu_pointer),
    tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
              false, Event::CPU_Tick_Pri),
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    coissue_return(1),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    n_cu_per_sqc(p.cu_per_sqc),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher), systemHub(p.system_hub),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    stats(this, p.CUs[0]->wfSize())
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);

    // These apertures are set by the driver. In full system mode that is done
    // using a PM4 packet but the emulated SE mode driver does not set them
    // explicitly, so we need to define some reasonable defaults here.
    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = 0x1000000000000;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = 0x2000000000000;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

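    // With these defaults the apertures cover:
    //   GPUVM:   0x2001000000000000 - 0x200100FFFFFFFFFF (1 TiB)
    //   LDS:     0x0001000000000000 - 0x00010000FFFFFFFF (4 GiB)
    //   scratch: 0x0002000000000000 - 0x00020000FFFFFFFF (4 GiB)
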
    // The scratch and LDS address can be queried starting in gfx900. The
    // base addresses are in the SH_MEM_BASES 32-bit register. The upper 16
    // bits are for the LDS address and the lower 16 bits are for the scratch
    // address. In both cases the 16 bits represent bits 63:48 of the address.
    // This means bits 47:0 of the base address are always zero.
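    // For example, the value 0x00010002 programmed below decodes to an LDS
    // base of 0x0001000000000000 (bits 31:16 << 48) and a scratch base of
    // 0x0002000000000000 (bits 15:0 << 48), matching the apertures above.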
    setHwReg(HW_REG_SH_MEM_BASES, 0x00010002);

    shHiddenPrivateBaseVmid = 0;

    cuList.resize(n_cu);

    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p.CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p.idlecu_timeout;
    }
}

GPUDispatcher&
Shader::dispatcher()
{
    return _dispatcher;
}

Addr
Shader::mmap(int length)
{

    Addr start;

    // round up length to the next page
    length = roundUp(length, X86ISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid) {
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

void
Shader::execScheduledAdds()
{
    assert(!sa_when.empty());

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= curTick()) {
            *sa_val[i] += sa_x[i];
            panic_if(*sa_val[i] < 0, "Negative counter value\n");
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }
    if (!sa_when.empty()) {
        Tick shader_wakeup = *std::max_element(sa_when.begin(),
                                               sa_when.end());
        DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
        schedule(tickEvent, shader_wakeup);
    } else {
        DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    }
}

/*
 * dispatcher/shader arranges invalidate requests to the CUs
 */
void
Shader::prepareInvalidate(HSAQueueEntry *task) {
    // if invalidate has already started/finished, then do nothing
    if (task->isInvStarted()) return;

    // invalidate has never started; it is performed only once at kernel launch
    assert(task->outstandingInvs() == -1);
    int kernId = task->dispatchId();
    // counter value is 0 now, indicating the inv is about to start
    _dispatcher.updateInvCounter(kernId, +1);

    // iterate over all CUs managed by the shader to perform the invalidate
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // create a request to hold INV info; the request's fields will
        // be updated in the CU before use
        auto tcc_req = std::make_shared<Request>(0, 0, 0,
                                                 cuList[i_cu]->requestorId(),
                                                 0, -1);

        _dispatcher.updateInvCounter(kernId, +1);
        // all necessary INV flags are set now, call the CU to execute
        cuList[i_cu]->doInvalidate(tcc_req, task->dispatchId());


        // A set of CUs share a single SQC cache. Send a single invalidate
        // request to each SQC
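        // (e.g. with n_cu_per_sqc == 4, only CUs 0, 4, 8, ... issue one)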
        auto sqc_req = std::make_shared<Request>(0, 0, 0,
                                                 cuList[i_cu]->requestorId(),
                                                 0, -1);

        if ((i_cu % n_cu_per_sqc) == 0) {
            cuList[i_cu]->doSQCInvalidate(sqc_req, task->dispatchId());
        }

        // I don't like this. This is intrusive coding.
        cuList[i_cu]->resetRegisterPool();
    }
}

/*
 * dispatcher/shader arranges flush requests to the CUs
 */
void
Shader::prepareFlush(GPUDynInstPtr gpuDynInst) {
    int kernId = gpuDynInst->kern_id;
    // flush has never been started, performed only once at kernel end
    assert(_dispatcher.getOutstandingWbs(kernId) == 0);

    // the first CU, managed by the shader, performs the flush operation,
    // assuming that the L2 cache is shared by all CUs in the shader
    int i_cu = 0;
    _dispatcher.updateWbCounter(kernId, +1);
    cuList[i_cu]->doFlush(gpuDynInst);
}

bool
Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;
    int disp_count(0);

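    // Scan the CUs round-robin starting at nextSchedCu; each CU that has
    // enough free resources for this task receives at most one workgroup
    // per call.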
    while (cuCount < n_cu) {
        // Every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch workgroup iff the following two conditions are met:
        // (a) wg_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in CU cuList[curCu] for this wg
        int num_wfs_in_wg = 0;
        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
        if (!task->dispComplete() && can_disp) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    curTick(), task->globalWgId(), curCu);

            if (!cuList[curCu]->tickEvent.scheduled()) {
                if (!_activeCus)
                    _lastInactiveTick = curTick();
                _activeCus++;
            }

            panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
                     "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);

            task->markWgDispatch();
            ++disp_count;
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);

    return scheduledSomething;
}

void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseMMU::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseMMU::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseMMU::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

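    // Example: an 8-byte access at 0x3fc with a 64-byte cache line gives
    // split_addr = 0x400 > 0x3fc, so the request is split into
    // [0x3fc, 0x400) and [0x400, 0x404) and each half is accessed separately.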
    // Misaligned access
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req2, cmd);
        PacketPtr pkt2 = new Packet(req1, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0].sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

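/*
 * ScheduleAdd registers a deferred counter update: *val is incremented by x
 * once 'when' ticks from now have elapsed. Pending updates are kept in the
 * parallel vectors sa_val/sa_x/sa_when and applied by execScheduledAdds()
 * when the shader tick event fires.
 */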
void
Shader::ScheduleAdd(int *val, Tick when, int x)
{
    sa_val.push_back(val);
    when += curTick();
    sa_when.push_back(when);
    sa_x.push_back(x);
    ++sa_n;
    if (!tickEvent.scheduled() || (when < tickEvent.when())) {
        DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
                "%lu\n", when);
        reschedule(tickEvent, when, true);
    } else {
        assert(tickEvent.scheduled());
        DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
                "%lu\n", when);
    }
}

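/*
 * Break a memory access into cache-line-sized chunks and perform a
 * functional access for each chunk through doFunctionalAccess().
 */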
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {

        RequestPtr req = std::make_shared<Request>(
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id=n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new GpuTranslationState(mode, gpuTc, false);

    // Even when the perLaneTLB flag is turned on it is ok to send all
    // accesses through lane 0, since the lane # is not known here.
    // This isn't important since these are functional accesses.
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);

    /* safe_cast the senderState */
    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleStore(const Tick accessTime)
{
    stats.storeLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleLoad(const Tick accessTime)
{
    stats.loadLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);
}

void
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
{
    // Only sample instructions that go all the way to main memory
    if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
        return;
    }

    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];
    stats.initToCoalesceLatency.sample(t2 - t1);
    stats.rubyNetworkLatency.sample(t3 - t2);
    stats.gmEnqueueLatency.sample(t4 - t3);
    stats.gmToCompleteLatency.sample(t5 - t4);
}

void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
{
    stats.coalsrLineAddresses.sample(lineMap.size());
    std::vector<Tick> netTimes;

    // For each cache block address generated by a vmem inst, calculate
    // the round-trip time for that cache block.
    for (auto& it : lineMap) {
        const std::vector<Tick>& timeVec = it.second;
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        }
    }

    // Sort the cache block round-trip times so that the first
    // distribution always measures the fastest and the last
    // distribution always measures the slowest cache block.
    std::sort(netTimes.begin(), netTimes.end());

    // Sample the round-trip time for each of the N cache blocks into the
    // Nth distribution.
    int idx = 0;
    for (auto& time : netTimes) {
        stats.cacheBlockRoundTrip[idx].sample(time);
        ++idx;
    }
}

void
Shader::notifyCuSleep() {
    // If all CUs attached to this shader are asleep, update shaderActiveTicks
    panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
             "Invalid activeCu size\n");
    _activeCus--;
    if (!_activeCus) {
        stats.shaderActiveTicks += curTick() - _lastInactiveTick;

        if (kernelExitRequested) {
            kernelExitRequested = false;
            if (blitKernel) {
                exitSimLoop("GPU Blit Kernel Completed");
            } else {
                exitSimLoop("GPU Kernel Completed");
            }
        }
    }
}

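/*
 * Called when an outstanding L2 invalidate completes. Once none remain, any
 * kernel dispatches that were deferred while invalidates were in flight are
 * resubmitted to the command processor.
 */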
void
Shader::decNumOutstandingInvL2s()
{
    num_outstanding_invl2s--;

    if (num_outstanding_invl2s == 0 && !deferred_dispatches.empty()) {
        for (auto &dispatch : deferred_dispatches) {
            gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch),
                                         std::get<1>(dispatch),
                                         std::get<2>(dispatch));
        }
        deferred_dispatches.clear();
    }
}

void
Shader::addDeferredDispatch(void *raw_pkt, uint32_t queue_id,
                            Addr host_pkt_addr)
{
    deferred_dispatches.push_back(
        std::make_tuple(raw_pkt, queue_id, host_pkt_addr));
}

/**
 * Forward the VRAM requestor ID needed for device memory from CP.
 */
RequestorID
Shader::vramRequestorId()
{
    return gpuCmdProc.vramRequestorId();
}

Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
    : statistics::Group(parent),
    ADD_STAT(allLatencyDist, "delay distribution for all"),
    ADD_STAT(loadLatencyDist, "delay distribution for loads"),
    ADD_STAT(storeLatencyDist, "delay distribution for stores"),
    ADD_STAT(initToCoalesceLatency,
             "Ticks from vmem inst initiateAcc to coalescer issue"),
    ADD_STAT(rubyNetworkLatency,
             "Ticks from coalescer issue to coalescer hit callback"),
    ADD_STAT(gmEnqueueLatency,
             "Ticks from coalescer hit callback to GM pipe enqueue"),
    ADD_STAT(gmToCompleteLatency,
             "Ticks queued in GM pipes ordered response buffer"),
    ADD_STAT(coalsrLineAddresses,
             "Number of cache lines for coalesced request"),
    ADD_STAT(shaderActiveTicks,
             "Total ticks that any CU attached to this shader is active"),
    ADD_STAT(vectorInstSrcOperand,
             "vector instruction source operand distribution"),
    ADD_STAT(vectorInstDstOperand,
             "vector instruction destination operand distribution")
{
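    // Each latency histogram covers [0, 1600000) ticks in 10000-tick buckets,
    // i.e. 160 buckets per distribution.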
    allLatencyDist
        .init(0, 1600000-1, 10000)
        .flags(statistics::pdf | statistics::oneline);

    loadLatencyDist
        .init(0, 1600000-1, 10000)
        .flags(statistics::pdf | statistics::oneline);

    storeLatencyDist
        .init(0, 1600000-1, 10000)
        .flags(statistics::pdf | statistics::oneline);

    initToCoalesceLatency
        .init(0, 1600000-1, 10000)
        .flags(statistics::pdf | statistics::oneline);

    rubyNetworkLatency
        .init(0, 1600000-1, 10000)
        .flags(statistics::pdf | statistics::oneline);

    gmEnqueueLatency
        .init(0, 1600000-1, 10000)
        .flags(statistics::pdf | statistics::oneline);

    gmToCompleteLatency
        .init(0, 1600000-1, 10000)
        .flags(statistics::pdf | statistics::oneline);

    coalsrLineAddresses
        .init(0, 20, 1)
        .flags(statistics::pdf | statistics::oneline);

    vectorInstSrcOperand.init(4);
    vectorInstDstOperand.init(4);

    cacheBlockRoundTrip = new statistics::Distribution[wf_size];
    for (int idx = 0; idx < wf_size; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
                 static_cast<Shader*>(parent)->name(), idx);
        cacheBlockRoundTrip[idx]
            .init(0, 1600000-1, 10000)
            .name(namestr.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
            .flags(statistics::pdf | statistics::oneline);
    }
}

} // namespace gem5