gem5 [DEVELOP-FOR-25.0]
shader.cc
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "gpu-compute/shader.hh"
33
34#include <limits>
35
39#include "debug/GPUAgentDisp.hh"
40#include "debug/GPUDisp.hh"
41#include "debug/GPUMem.hh"
42#include "debug/GPUShader.hh"
43#include "debug/GPUWgLatency.hh"
50#include "mem/packet.hh"
52#include "sim/sim_exit.hh"
53
54namespace gem5
55{
56
57Shader::Shader(const Params &p) : ClockedObject(p),
58 _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
59 gpuTc(nullptr), cpuPointer(p.cpu_pointer),
60 tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
61 false, Event::CPU_Tick_Pri),
62 timingSim(p.timing), hsail_mode(SIMT),
63 impl_kern_launch_acq(p.impl_kern_launch_acq),
64 impl_kern_end_rel(p.impl_kern_end_rel),
65 coissue_return(1),
66 trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
67 n_cu_per_sqc(p.cu_per_sqc),
68 globalMemSize(p.globalmem),
69 nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
70 _dispatcher(*p.dispatcher), systemHub(p.system_hub),
71 max_valu_insts(p.max_valu_insts), total_valu_insts(0),
72 progressInterval(p.progress_interval),
73 stats(this, p.CUs[0]->wfSize())
74{
75 gpuCmdProc.setShader(this);
76 _dispatcher.setShader(this);
77
78 // These apertures are set by the driver. In full system mode that is done
79 // using a PM4 packet but the emulated SE mode driver does not set them
80 // explicitly, so we need to define some reasonable defaults here.
81 _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
82 _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
83
84 _ldsApe.base = 0x1000000000000;
85 _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
86
87 _scratchApe.base = 0x2000000000000;
88 _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
89
90 // The scratch and LDS address can be queried starting in gfx900. The
91 // base addresses are in the SH_MEM_BASES 32-bit register. The upper 16
92 // bits are for the LDS address and the lower 16 bits are for scratch
93 // address. In both cases the 16 bits represent bits 63:48 of the address.
94 // This means bits 47:0 of the base address are always zero.
95 setHwReg(HW_REG_SH_MEM_BASES, 0x00010002);
96
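// Illustrative decode (not part of shader.cc): with the 0x00010002 written to
// SH_MEM_BASES above, each 16-bit field supplies bits 63:48 of a base address:
//   Addr lds_base     = (Addr)((0x00010002 >> 16) & 0xFFFF) << 48;  // 0x0001000000000000
//   Addr scratch_base = (Addr)( 0x00010002        & 0xFFFF) << 48;  // 0x0002000000000000
// which matches the _ldsApe.base and _scratchApe.base defaults set above.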
97 shHiddenPrivateBaseVmid = 0;
98
99 cuList.resize(n_cu);
100
101 panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
102
103 for (int i = 0; i < n_cu; ++i) {
104 cuList[i] = p.CUs[i];
105 assert(i == cuList[i]->cu_id);
106 cuList[i]->shader = this;
107 cuList[i]->idleCUTimeout = p.idlecu_timeout;
108 }
109}
110
111GPUDispatcher&
112Shader::dispatcher()
113{
114 return _dispatcher;
115}
116
117Addr
118Shader::mmap(int length)
119{
120
121 Addr start;
122
123 // round up length to the next page
124 length = roundUp(length, X86ISA::PageBytes);
125
126 Process *proc = gpuTc->getProcessPtr();
127 auto mem_state = proc->memState;
128
129 if (proc->mmapGrowsDown()) {
130 DPRINTF(GPUShader, "GROWS DOWN");
131 start = mem_state->getMmapEnd() - length;
132 mem_state->setMmapEnd(start);
133 } else {
134 DPRINTF(GPUShader, "GROWS UP");
135 start = mem_state->getMmapEnd();
136 mem_state->setMmapEnd(start + length);
137
138 // assertion to make sure we don't overwrite the stack (it grows down)
139 assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
140 mem_state->getMmapEnd());
141 }
142
143 DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
144
145 proc->allocateMem(start, length);
146
147 return start;
148}
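// Usage sketch (illustrative, not from shader.cc): a caller asking for 100
// bytes gets a region rounded up to X86ISA::PageBytes (4 KiB on x86):
//   Addr va = shader->mmap(100);   // allocates one page
// With a grows-down mmap region, va is the old mmap end minus 4 KiB and the
// mmap end is moved down to va before proc->allocateMem() backs the range.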
149
150void
151Shader::init()
152{
153 // grab the threadContext of the thread running on the CPU
154 assert(cpuPointer);
155 gpuTc = cpuPointer->getContext(0);
156 assert(gpuTc);
157}
158
158
159Shader::~Shader()
160{
161 for (int j = 0; j < n_cu; ++j)
162 delete cuList[j];
163}
164
165void
166Shader::updateContext(int cid) {
167 // context of the thread which dispatched work
168 assert(cpuPointer);
169 gpuTc = cpuPointer->getContext(cid);
170 assert(gpuTc);
171}
172
173void
174Shader::execScheduledAdds()
175{
176 assert(!sa_when.empty());
177
178 // apply any scheduled adds
179 for (int i = 0; i < sa_n; ++i) {
180 if (sa_when[i] <= curTick()) {
181 *sa_val[i] += sa_x[i];
182 panic_if(*sa_val[i] < 0, "Negative counter value\n");
183 sa_val.erase(sa_val.begin() + i);
184 sa_x.erase(sa_x.begin() + i);
185 sa_when.erase(sa_when.begin() + i);
186 --sa_n;
187 --i;
188 }
189 }
190 if (!sa_when.empty()) {
191 Tick shader_wakeup = *std::max_element(sa_when.begin(),
192 sa_when.end());
193 DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
194 schedule(tickEvent, shader_wakeup);
195 } else {
196 DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
197 }
198}
199
200/*
201 * dispatcher/shader arranges invalidate requests to the CUs
202 */
203void
204Shader::prepareInvalidate(HSAQueueEntry *task) {
205 // if invalidate has already started/finished, then do nothing
206 if (task->isInvStarted()) return;
207
208 // invalidate has never started; it can only perform once at kernel launch
209 assert(task->outstandingInvs() == -1);
210 int kernId = task->dispatchId();
211 // counter value is 0 now, indicating the inv is about to start
212 _dispatcher.updateInvCounter(kernId, +1);
213
214 // iterate all cus managed by the shader, to perform invalidate.
215 for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
216 // create a request to hold INV info; the request's fields will
217 // be updated in cu before use
218 auto tcc_req = std::make_shared<Request>(0, 0, 0,
219 cuList[i_cu]->requestorId(),
220 0, -1);
221
222 _dispatcher.updateInvCounter(kernId, +1);
223 // all necessary INV flags are all set now, call cu to execute
224 cuList[i_cu]->doInvalidate(tcc_req, task->dispatchId());
225
226
227 // A set of CUs share a single SQC cache. Send a single invalidate
228 // request to each SQC
229 auto sqc_req = std::make_shared<Request>(0, 0, 0,
230 cuList[i_cu]->requestorId(),
231 0, -1);
232
233 if ((i_cu % n_cu_per_sqc) == 0) {
234 cuList[i_cu]->doSQCInvalidate(sqc_req, task->dispatchId());
235 }
236
237 // I don't like this. This is intrusive coding.
238 cuList[i_cu]->resetRegisterPool();
239 }
240}
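// Example of the SQC sharing check above (illustrative): with n_cu_per_sqc = 4,
// only i_cu = 0, 4, 8, ... satisfy (i_cu % n_cu_per_sqc) == 0, so one SQC
// invalidate is sent per group of four CUs while every CU still receives its
// own TCC invalidate via doInvalidate().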
241
242/**
243 * dispatcher/shader arranges flush requests to the CUs
244 */
245void
246Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
247 int kernId = gpuDynInst->kern_id;
248 // flush has never been started, performed only once at kernel end
249 assert(_dispatcher.getOutstandingWbs(kernId) == 0);
250
251 // the first cu, managed by the shader, performs flush operation,
252 // assuming that L2 cache is shared by all cus in the shader
253 int i_cu = 0;
254 _dispatcher.updateWbCounter(kernId, +1);
255 cuList[i_cu]->doFlush(gpuDynInst);
256}
257
258bool
259Shader::dispatchWorkgroups(HSAQueueEntry *task)
260{
261 bool scheduledSomething = false;
262 int cuCount = 0;
263 int curCu = nextSchedCu;
264 int disp_count(0);
265
266 while (cuCount < n_cu) {
267 //Every time we try a CU, update nextSchedCu
268 nextSchedCu = (nextSchedCu + 1) % n_cu;
269
270 // dispatch workgroup iff the following two conditions are met:
271 // (a) wg_rem is true - there are unassigned workgroups in the grid
272 // (b) there are enough free slots in cu cuList[i] for this wg
273 int num_wfs_in_wg = 0;
274 bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
275 if (!task->dispComplete() && can_disp) {
276 scheduledSomething = true;
277 DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
278 curCu, task->globalWgId());
279 DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
280 curCu, task->globalWgId());
281 DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
282 curTick(), task->globalWgId(), curCu);
283
284 if (!cuList[curCu]->tickEvent.scheduled()) {
285 if (!_activeCus)
286 _lastInactiveTick = curTick();
287 _activeCus++;
288 }
289
291 "Invalid activeCu size\n");
292 cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
293
294 task->markWgDispatch();
295 ++disp_count;
296 }
297
298 ++cuCount;
299 curCu = nextSchedCu;
300 }
301
302 DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);
303
304 return scheduledSomething;
305}
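// Scheduling note (illustrative): with n_cu = 4 and nextSchedCu = 2, the loop
// above probes CUs in the order 2, 3, 0, 1, updating nextSchedCu before each
// attempt, and dispatches at most one workgroup to each CU that reports free
// resources via hasDispResources() while the task still has undispatched WGs.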
306
307void
308Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
309 bool suppress_func_errors, int cu_id)
310{
311 int block_size = cuList.at(cu_id)->cacheLineSize();
312 unsigned size = req->getSize();
313
314 Addr tmp_addr;
315 BaseMMU::Mode trans_mode;
316
317 if (cmd == MemCmd::ReadReq) {
318 trans_mode = BaseMMU::Read;
319 } else if (cmd == MemCmd::WriteReq) {
320 trans_mode = BaseMMU::Write;
321 } else {
322 fatal("unexpected MemCmd\n");
323 }
324
325 tmp_addr = req->getVaddr();
326 Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
327
328 assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
329
330 // Misaligned access
331 if (split_addr > tmp_addr) {
332 RequestPtr req1, req2;
333 req->splitOnVaddr(split_addr, req1, req2);
334
335 PacketPtr pkt1 = new Packet(req2, cmd);
336 PacketPtr pkt2 = new Packet(req1, cmd);
337
338 functionalTLBAccess(pkt1, cu_id, trans_mode);
339 functionalTLBAccess(pkt2, cu_id, trans_mode);
340
341 PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
342 PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
343
344 new_pkt1->dataStatic(data);
345 new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
346
347 if (suppress_func_errors) {
348 new_pkt1->setSuppressFuncError();
349 new_pkt2->setSuppressFuncError();
350 }
351
352 // fixme: this should be cuList[cu_id] if cu_id != n_cu
353 // The latter requires a memPort in the dispatcher
354 cuList[0]->memPort[0].sendFunctional(new_pkt1);
355 cuList[0]->memPort[0].sendFunctional(new_pkt2);
356
357 delete new_pkt1;
358 delete new_pkt2;
359 delete pkt1;
360 delete pkt2;
361 } else {
362 PacketPtr pkt = new Packet(req, cmd);
363 functionalTLBAccess(pkt, cu_id, trans_mode);
364 PacketPtr new_pkt = new Packet(pkt->req, cmd);
365 new_pkt->dataStatic(data);
366
367 if (suppress_func_errors) {
368 new_pkt->setSuppressFuncError();
369 };
370
371 // fixme: this should be cuList[cu_id] if cu_id != n_cu
372 // The latter requires a memPort in the dispatcher
373 cuList[0]->memPort[0].sendFunctional(new_pkt);
374
375 delete new_pkt;
376 delete pkt;
377 }
378}
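// Worked example for the split above (illustrative): with a 64-byte cache line,
// a 16-byte access at vaddr 0x1038 gives
//   split_addr = roundDown(0x1038 + 16 - 1, 64) = 0x1040 > 0x1038,
// so the request is split at 0x1040 into two 8-byte pieces, each translated
// with functionalTLBAccess() and sent through cuList[0]->memPort[0].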
379
380void
381Shader::ScheduleAdd(int *val, Tick when, int x)
382{
383 sa_val.push_back(val);
384 when += curTick();
385 sa_when.push_back(when);
386 sa_x.push_back(x);
387 ++sa_n;
388 if (!tickEvent.scheduled() || (when < tickEvent.when())) {
389 DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
390 "%lu\n", when);
391 reschedule(tickEvent, when, true);
392 } else {
393 assert(tickEvent.scheduled());
394 DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
395 "%lu\n", when);
396 }
397}
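// Usage sketch (illustrative; 'outstanding' is a hypothetical counter): a
// caller that wants a counter decremented 'delay' ticks from now would call
//   shader->ScheduleAdd(&outstanding, delay, -1);
// execScheduledAdds() later applies the add once curTick() reaches the stored
// time and keeps tickEvent scheduled while entries remain in sa_when.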
398
399void
400Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
401 MemCmd cmd, bool suppress_func_errors)
402{
403 uint8_t *data_buf = (uint8_t*)ptr;
404
405 for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
406 !gen.done(); gen.next()) {
407
408 RequestPtr req = std::make_shared<Request>(
409 gen.addr(), gen.size(), 0,
410 cuList[0]->requestorId(), 0, 0, nullptr);
411
412 doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
413 data_buf += gen.size();
414 }
415}
416
417void
418Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
419{
420 AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
421}
422
423void
424Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
425 bool suppress_func_errors)
426{
427 AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
428 suppress_func_errors);
429}
430
431void
432Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
433{
434 AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
435}
436
437void
438Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
439 bool suppress_func_errors)
440{
441 AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
442 suppress_func_errors);
443}
444
445/*
446 * Send a packet through the appropriate TLB functional port.
447 * If cu_id=n_cu, then this is the dispatcher's TLB.
448 * Otherwise it's the TLB of the cu_id compute unit.
449 */
450void
451Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
452{
453 // update senderState. Need to know the gpuTc and the TLB mode
454 pkt->senderState =
455 new GpuTranslationState(mode, gpuTc, false);
456
457 // even when the perLaneTLB flag is turned on
458 // it's ok to send all accesses through lane 0
459 // since the lane # is not known here.
460 // This isn't important since these are functional accesses.
461 cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
462
463 /* safe_cast the senderState */
464 GpuTranslationState *sender_state =
465 safe_cast<GpuTranslationState*>(pkt->senderState);
466
467 delete sender_state->tlbEntry;
468 delete pkt->senderState;
469}
470
471/*
472 * allow the shader to sample stats from constituent devices
473 */
474void
475Shader::sampleStore(const Tick accessTime)
476{
477 stats.storeLatencyDist.sample(accessTime);
478 stats.allLatencyDist.sample(accessTime);
479}
480
481/*
482 * allow the shader to sample stats from constituent devices
483 */
484void
485Shader::sampleLoad(const Tick accessTime)
486{
487 stats.loadLatencyDist.sample(accessTime);
488 stats.allLatencyDist.sample(accessTime);
489}
490
491void
492Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
493{
494 // Only sample instructions that go all the way to main memory
495 if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
496 return;
497 }
498
499 Tick t1 = roundTripTime[0];
500 Tick t2 = roundTripTime[1];
501 Tick t3 = roundTripTime[2];
502 Tick t4 = roundTripTime[3];
503 Tick t5 = roundTripTime[4];
504
505 stats.initToCoalesceLatency.sample(t2-t1);
506 stats.rubyNetworkLatency.sample(t3-t2);
507 stats.gmEnqueueLatency.sample(t4-t3);
508 stats.gmToCompleteLatency.sample(t5-t4);
509}
510
511void
512Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
513{
514 stats.coalsrLineAddresses.sample(lineMap.size());
515 std::vector<Tick> netTimes;
516
517 // For each cache block address generated by a vmem inst, calculate
518 // the round-trip time for that cache block.
519 for (auto& it : lineMap) {
520 const std::vector<Tick>& timeVec = it.second;
521 if (timeVec.size() == 2) {
522 netTimes.push_back(timeVec[1] - timeVec[0]);
523 }
524 }
525
526 // Sort the cache block round trip times so that the first
527 // distribution is always measuring the fastest and the last
528 // distribution is always measuring the slowest cache block.
529 std::sort(netTimes.begin(), netTimes.end());
530
531 // Sample the round trip time for each N cache blocks into the
532 // Nth distribution.
533 int idx = 0;
534 for (auto& time : netTimes) {
535 stats.cacheBlockRoundTrip[idx].sample(time);
536 ++idx;
537 }
538}
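// Example (illustrative): if a vmem instruction touched three cache lines with
// (issue, return) tick pairs {(100,400), (120,300), (150,700)}, netTimes becomes
// {300, 180, 550}, is sorted to {180, 300, 550}, and those values are sampled
// into cacheBlockRoundTrip[0], [1] and [2] respectively.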
539
540void
541Shader::notifyCuSleep() {
542 // If all CUs attached to this shader are asleep, update shaderActiveTicks
543 panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
544 "Invalid activeCu size\n");
545 _activeCus--;
546 if (!_activeCus) {
547 stats.shaderActiveTicks += curTick() - _lastInactiveTick;
548
549 if (kernelExitRequested) {
550 kernelExitRequested = false;
551 if (blitKernel) {
552 exitSimLoop("GPU Blit Kernel Completed");
553 } else {
554 exitSimLoop("GPU Kernel Completed");
555 }
556 }
557 }
558}
559
560void
561Shader::decNumOutstandingInvL2s()
562{
563 num_outstanding_invl2s--;
564
565 if (num_outstanding_invl2s == 0 && !deferred_dispatches.empty()) {
566 for (auto &dispatch : deferred_dispatches) {
567 gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch),
568 std::get<1>(dispatch),
569 std::get<2>(dispatch));
570 }
571 deferred_dispatches.clear();
572 }
573}
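// Flow note (illustrative): num_outstanding_invl2s tracks in-flight L2
// invalidates; once the count reaches zero here, any dispatches queued through
// addDeferredDispatch() below are replayed via submitDispatchPkt() and the
// deferred list is cleared.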
574
575void
576Shader::addDeferredDispatch(void *raw_pkt, uint32_t queue_id,
577 Addr host_pkt_addr)
578{
579 deferred_dispatches.push_back(
580 std::make_tuple(raw_pkt, queue_id, host_pkt_addr));
581}
582
582
583/**
584 * Forward the VRAM requestor ID needed for device memory from CP.
585 */
586RequestorID
587Shader::vramRequestorId()
588{
589 return gpuCmdProc.vramRequestorId();
590}
591
592GfxVersion
593Shader::getGfxVersion() const
594{
595 return gpuCmdProc.getGfxVersion();
596}
597
598Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
599 : statistics::Group(parent),
600 ADD_STAT(allLatencyDist, "delay distribution for all"),
601 ADD_STAT(loadLatencyDist, "delay distribution for loads"),
602 ADD_STAT(storeLatencyDist, "delay distribution for stores"),
604 "Ticks from vmem inst initiateAcc to coalescer issue"),
606 "Ticks from coalescer issue to coalescer hit callback"),
608 "Ticks from coalescer hit callback to GM pipe enqueue"),
610 "Ticks queued in GM pipes ordered response buffer"),
612 "Number of cache lines for coalesced request"),
614 "Total ticks that any CU attached to this shader is active"),
616 "vector instruction source operand distribution"),
618 "vector instruction destination operand distribution")
619{
620 allLatencyDist
621 .init(0, 1600000-1, 10000)
622 .flags(statistics::pdf | statistics::oneline);
623
624 loadLatencyDist
625 .init(0, 1600000-1, 10000)
626 .flags(statistics::pdf | statistics::oneline);
627
628 storeLatencyDist
629 .init(0, 1600000-1, 10000)
630 .flags(statistics::pdf | statistics::oneline);
631
632 initToCoalesceLatency
633 .init(0, 1600000-1, 10000)
634 .flags(statistics::pdf | statistics::oneline);
635
636 rubyNetworkLatency
637 .init(0, 1600000-1, 10000)
638 .flags(statistics::pdf | statistics::oneline);
639
640 gmEnqueueLatency
641 .init(0, 1600000-1, 10000)
642 .flags(statistics::pdf | statistics::oneline);
643
644 gmToCompleteLatency
645 .init(0, 1600000-1, 10000)
646 .flags(statistics::pdf | statistics::oneline);
647
648 coalsrLineAddresses
649 .init(0, 20, 1)
650 .flags(statistics::pdf);
651
652 vectorInstSrcOperand.init(4);
653 vectorInstDstOperand.init(4);
654
655 cacheBlockRoundTrip = new statistics::Distribution[wf_size];
656 for (int idx = 0; idx < wf_size; ++idx) {
657 std::stringstream namestr;
658 ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
659 static_cast<Shader*>(parent)->name(), idx);
660 cacheBlockRoundTrip[idx]
661 .init(0, 1600000-1, 10000)
662 .name(namestr.str())
663 .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
664 .flags(statistics::pdf | statistics::oneline);
665 }
666}
667
668} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
Declaration and inline definition of ChunkGenerator object.
const char data[]
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
ClockedObject(const ClockedObjectParams &p)
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
virtual std::string name() const
Definition named.hh:60
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition packet.hh:1175
SenderState * senderState
This packet's sender state.
Definition packet.hh:545
RequestPtr req
A pointer to the original request.
Definition packet.hh:377
void setSuppressFuncError()
Definition packet.hh:757
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd?
Definition process.hh:152
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
Definition process.cc:318
std::shared_ptr< MemState > memState
Definition process.hh:301
bool kernelExitRequested
Definition shader.hh:102
Addr mmap(int length)
Definition shader.cc:118
void prepareInvalidate(HSAQueueEntry *task)
Definition shader.cc:204
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition shader.cc:400
bool blitKernel
Definition shader.hh:105
void notifyCuSleep()
Definition shader.cc:541
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
Definition shader.cc:308
void execScheduledAdds()
Definition shader.cc:174
GfxVersion getGfxVersion() const
Definition shader.cc:593
EventFunctionWrapper tickEvent
Definition shader.hh:231
std::vector< ComputeUnit * > cuList
Definition shader.hh:269
void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
Definition shader.cc:576
int nextSchedCu
Definition shader.hh:256
void ScheduleAdd(int *val, Tick when, int x)
Definition shader.cc:381
GPUDispatcher & _dispatcher
Definition shader.hh:272
uint32_t sa_n
Definition shader.hh:259
ShaderParams Params
Definition shader.hh:113
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick > > &roundTripTime)
Definition shader.cc:512
std::vector< uint64_t > sa_when
Definition shader.hh:264
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition shader.cc:151
std::vector< int32_t > sa_x
Definition shader.hh:266
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition shader.cc:418
gem5::Shader::ShaderStats stats
std::vector< std::tuple< void *, uint32_t, Addr > > deferred_dispatches
Definition shader.hh:110
ThreadContext * gpuTc
Definition shader.hh:124
bool dispatchWorkgroups(HSAQueueEntry *task)
Definition shader.cc:259
GPUDispatcher & dispatcher()
Definition shader.cc:112
Shader(const Params &p)
Definition shader.cc:57
void decNumOutstandingInvL2s()
Definition shader.cc:561
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
Definition shader.cc:587
SimpleThread * cpuThread
Definition shader.hh:123
void updateContext(int cid)
Definition shader.cc:166
int n_cu_per_sqc
Definition shader.hh:250
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition shader.cc:432
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
Definition shader.cc:246
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition shader.cc:492
void sampleLoad(const Tick accessTime)
Definition shader.cc:485
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
Definition shader.cc:451
int num_outstanding_invl2s
Definition shader.hh:109
void sampleStore(const Tick accessTime)
Definition shader.cc:475
BaseCPU * cpuPointer
Definition shader.hh:125
GPUCommandProcessor & gpuCmdProc
Definition shader.hh:271
Tick _lastInactiveTick
Definition shader.hh:98
std::vector< int * > sa_val
Definition shader.hh:262
int _activeCus
Definition shader.hh:95
A simple distribution stat.
Statistics container.
Definition group.hh:93
STL vector class.
Definition stl.hh:37
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition group.hh:75
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Definition intmath.hh:279
static constexpr T roundUp(const T &val, const U &align)
This function is used to align addresses in memory.
Definition intmath.hh:260
bool done() const
Are we done?
void schedule(Event &event, Tick when)
Definition eventq.hh:1012
void reschedule(Event &event, Tick when, bool always=false)
Definition eventq.hh:1030
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition eventq.hh:207
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:246
HSAQueueEntry is the simulator's internal representation of an AQL queue entry (task).
Bitfield< 4 > t4
Bitfield< 2 > t2
Bitfield< 4, 0 > mode
Definition misc_types.hh:74
Bitfield< 3 > t3
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 5 > t5
Bitfield< 1 > t1
Bitfield< 0 > p
Bitfield< 3 > x
Definition pagetable.hh:76
Bitfield< 63 > val
Definition misc.hh:804
const Addr PageBytes
Definition page_size.hh:49
const FlagsType pdf
Print the percent of the total that this entry represents.
Definition info.hh:61
const FlagsType oneline
Print all values on a single line.
Definition info.hh:71
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
T safe_cast(U &&ref_or_ptr)
Definition cast.hh:74
std::shared_ptr< Request > RequestPtr
Definition request.hh:94
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
@ InstMemoryHopMax
Definition misc.hh:58
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
The "old style" exitSimLoop functions.
uint64_t Tick
Tick count type.
Definition types.hh:58
uint16_t RequestorID
Definition request.hh:95
Packet * PacketPtr
void ccprintf(cp::Print &print)
Definition cprintf.hh:130
@ HW_REG_SH_MEM_BASES
Declaration of the Packet class.
GPU TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
statistics::Vector vectorInstSrcOperand
Definition shader.hh:380
statistics::Distribution storeLatencyDist
Definition shader.hh:358
statistics::Distribution initToCoalesceLatency
Definition shader.hh:361
statistics::Scalar shaderActiveTicks
Definition shader.hh:379
statistics::Distribution loadLatencyDist
Definition shader.hh:357
statistics::Distribution allLatencyDist
Definition shader.hh:356
statistics::Distribution gmToCompleteLatency
Definition shader.hh:370
ShaderStats(statistics::Group *parent, int wf_size)
Definition shader.cc:598
statistics::Distribution coalsrLineAddresses
Definition shader.hh:373
statistics::Vector vectorInstDstOperand
Definition shader.hh:381
statistics::Distribution rubyNetworkLatency
Definition shader.hh:364
statistics::Distribution * cacheBlockRoundTrip
Definition shader.hh:377
statistics::Distribution gmEnqueueLatency
Definition shader.hh:367
