gem5 v24.0.0.0
shader.cc
1/*
2 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "gpu-compute/shader.hh"
33
34#include <limits>
35
39#include "debug/GPUAgentDisp.hh"
40#include "debug/GPUDisp.hh"
41#include "debug/GPUMem.hh"
42#include "debug/GPUShader.hh"
43#include "debug/GPUWgLatency.hh"
50#include "mem/packet.hh"
52#include "sim/sim_exit.hh"
53
54namespace gem5
55{
56
57Shader::Shader(const Params &p) : ClockedObject(p),
58 _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
59 gpuTc(nullptr), cpuPointer(p.cpu_pointer),
60 tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
61 false, Event::CPU_Tick_Pri),
62 timingSim(p.timing), hsail_mode(SIMT),
63 impl_kern_launch_acq(p.impl_kern_launch_acq),
64 impl_kern_end_rel(p.impl_kern_end_rel),
65 coissue_return(1),
66 trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
67 n_cu_per_sqc(p.cu_per_sqc),
68 globalMemSize(p.globalmem),
69 nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
70 _dispatcher(*p.dispatcher), systemHub(p.system_hub),
71 max_valu_insts(p.max_valu_insts), total_valu_insts(0),
72 stats(this, p.CUs[0]->wfSize())
73{
74 gpuCmdProc.setShader(this);
75 _dispatcher.setShader(this);
76
77 // These apertures are set by the driver. In full system mode that is done
78 // using a PM4 packet but the emulated SE mode driver does not set them
79 // explicitly, so we need to define some reasonable defaults here.
80 _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
81 _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
82
83 _ldsApe.base = 0x1000000000000;
84 _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
85
86 _scratchApe.base = 0x2000000000000;
87 _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
88
89 // The scratch and LDS address can be queried starting in gfx900. The
90 // base addresses are in the SH_MEM_BASES 32-bit register. The upper 16
91 // bits are for the LDS address and the lower 16 bits are for scratch
92 // address. In both cases the 16 bits represent bits 63:48 of the address.
93 // This means bits 47:0 of the base address are always zero.
94 setHwReg(HW_REG_SH_MEM_BASES, 0x00010002);
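    // Worked example (editorial annotation, not part of the upstream file):
    // the LDS aperture base 0x1000000000000 has bits 63:48 equal to 0x0001 and
    // the scratch aperture base 0x2000000000000 has bits 63:48 equal to 0x0002.
    // Packing LDS into the upper 16 bits and scratch into the lower 16 bits
    // gives (0x0001 << 16) | 0x0002 = 0x00010002, the value written above.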
95
96 shHiddenPrivateBaseVmid = 0;
97
98 cuList.resize(n_cu);
99
100 panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
101
102 for (int i = 0; i < n_cu; ++i) {
103 cuList[i] = p.CUs[i];
104 assert(i == cuList[i]->cu_id);
105 cuList[i]->shader = this;
106 cuList[i]->idleCUTimeout = p.idlecu_timeout;
107 }
108}
109
110GPUDispatcher&
111Shader::dispatcher()
112{
113 return _dispatcher;
114}
115
116Addr
117Shader::mmap(int length)
118{
119
120 Addr start;
121
122 // round up length to the next page
123 length = roundUp(length, X86ISA::PageBytes);
124
125 Process *proc = gpuTc->getProcessPtr();
126 auto mem_state = proc->memState;
127
128 if (proc->mmapGrowsDown()) {
129 DPRINTF(GPUShader, "GROWS DOWN");
130 start = mem_state->getMmapEnd() - length;
131 mem_state->setMmapEnd(start);
132 } else {
133 DPRINTF(GPUShader, "GROWS UP");
134 start = mem_state->getMmapEnd();
135 mem_state->setMmapEnd(start + length);
136
137 // assertion to make sure we don't overwrite the stack (it grows down)
138 assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
139 mem_state->getMmapEnd());
140 }
141
142 DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
143
144 proc->allocateMem(start, length);
145
146 return start;
147}
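// Example (editorial annotation, not part of the upstream file; the mmap end
// address is hypothetical): a request for 5000 bytes is first rounded up to
// two 4 KiB x86 pages (8192 bytes). If the region grows down from
// mmapEnd = 0x7fff00000000, then start = 0x7fff00000000 - 0x2000 =
// 0x7ffeffffe000 and mmapEnd is moved down to that new start address.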
148
149void
150Shader::init()
151{
152 // grab the threadContext of the thread running on the CPU
153 assert(cpuPointer);
154 gpuTc = cpuPointer->getContext(0);
155 assert(gpuTc);
156}
157
158Shader::~Shader()
159{
160 for (int j = 0; j < n_cu; ++j)
161 delete cuList[j];
162}
163
164void
165Shader::updateContext(int cid) {
166 // context of the thread which dispatched work
167 assert(cpuPointer);
168 gpuTc = cpuPointer->getContext(cid);
169 assert(gpuTc);
170}
171
172void
173Shader::execScheduledAdds()
174{
175 assert(!sa_when.empty());
176
177 // apply any scheduled adds
178 for (int i = 0; i < sa_n; ++i) {
179 if (sa_when[i] <= curTick()) {
180 *sa_val[i] += sa_x[i];
181 panic_if(*sa_val[i] < 0, "Negative counter value\n");
182 sa_val.erase(sa_val.begin() + i);
183 sa_x.erase(sa_x.begin() + i);
184 sa_when.erase(sa_when.begin() + i);
185 --sa_n;
186 --i;
187 }
188 }
189 if (!sa_when.empty()) {
190 Tick shader_wakeup = *std::max_element(sa_when.begin(),
191 sa_when.end());
192 DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
193 schedule(tickEvent, shader_wakeup);
194 } else {
195 DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
196 }
197}
198
199/*
200 * dispatcher/shader arranges invalidate requests to the CUs
201 */
202void
203Shader::prepareInvalidate(HSAQueueEntry *task) {
204 // if invalidate has already started/finished, then do nothing
205 if (task->isInvStarted()) return;
206
207 // invalidate has never started; it can only perform once at kernel launch
208 assert(task->outstandingInvs() == -1);
209 int kernId = task->dispatchId();
210 // counter value is 0 now, indicating the inv is about to start
211 _dispatcher.updateInvCounter(kernId, +1);
212
213 // iterate all cus managed by the shader, to perform invalidate.
214 for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
215 // create a request to hold INV info; the request's fields will
216 // be updated in cu before use
217 auto req = std::make_shared<Request>(0, 0, 0,
218 cuList[i_cu]->requestorId(),
219 0, -1);
220
221 _dispatcher.updateInvCounter(kernId, +1);
222 // all necessary INV flags are all set now, call cu to execute
223 cuList[i_cu]->doInvalidate(req, task->dispatchId());
224
225
226 // A set of CUs share a single SQC cache. Send a single invalidate
227 // request to each SQC
228 if ((i_cu % n_cu_per_sqc) == 0) {
229 cuList[i_cu]->doSQCInvalidate(req, task->dispatchId());
230 }
231
232 // I don't like this. This is intrusive coding.
233 cuList[i_cu]->resetRegisterPool();
234 }
235}
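// Counting note (editorial annotation, not part of the upstream file): the
// kernel's invalidate counter starts at -1 (see the outstandingInvs() == -1
// assertion above); the first updateInvCounter(+1) brings it to 0 to mark the
// invalidate as started, and each CU contributes a further +1 so the
// dispatcher can track one outstanding invalidate request per managed CU.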
236
237/*
238 * dispatcher/shader arranges flush requests to the CUs
239 */
240void
241Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
242 int kernId = gpuDynInst->kern_id;
243 // flush has never been started, performed only once at kernel end
244 assert(_dispatcher.getOutstandingWbs(kernId) == 0);
245
246 // the first cu, managed by the shader, performs flush operation,
247 // assuming that L2 cache is shared by all cus in the shader
248 int i_cu = 0;
249 _dispatcher.updateWbCounter(kernId, +1);
250 cuList[i_cu]->doFlush(gpuDynInst);
251}
252
253bool
254Shader::dispatchWorkgroups(HSAQueueEntry *task)
255{
256 bool scheduledSomething = false;
257 int cuCount = 0;
258 int curCu = nextSchedCu;
259 int disp_count(0);
260
261 while (cuCount < n_cu) {
262 //Every time we try a CU, update nextSchedCu
263 nextSchedCu = (nextSchedCu + 1) % n_cu;
264
265 // dispatch workgroup iff the following two conditions are met:
266 // (a) wg_rem is true - there are unassigned workgroups in the grid
267 // (b) there are enough free slots in cu cuList[i] for this wg
268 int num_wfs_in_wg = 0;
269 bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
270 if (!task->dispComplete() && can_disp) {
271 scheduledSomething = true;
272 DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
273 curCu, task->globalWgId());
274 DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
275 curCu, task->globalWgId());
276 DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
277 curTick(), task->globalWgId(), curCu);
278
279 if (!cuList[curCu]->tickEvent.scheduled()) {
280 if (!_activeCus)
281 _lastInactiveTick = curTick();
282 _activeCus++;
283 }
284
286 "Invalid activeCu size\n");
287 cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
288
289 task->markWgDispatch();
290 ++disp_count;
291 }
292
293 ++cuCount;
294 curCu = nextSchedCu;
295 }
296
297 DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);
298
299 return scheduledSomething;
300}
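// Round-robin illustration (editorial annotation, not part of the upstream
// file): with n_cu = 4 and nextSchedCu = 2, the while loop above probes CUs in
// the order 2, 3, 0, 1, advancing nextSchedCu modulo n_cu on every attempt so
// that the next call to dispatchWorkgroups() starts from where this one left off.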
301
302void
303Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
304 bool suppress_func_errors, int cu_id)
305{
306 int block_size = cuList.at(cu_id)->cacheLineSize();
307 unsigned size = req->getSize();
308
309 Addr tmp_addr;
310 BaseMMU::Mode trans_mode;
311
312 if (cmd == MemCmd::ReadReq) {
313 trans_mode = BaseMMU::Read;
314 } else if (cmd == MemCmd::WriteReq) {
315 trans_mode = BaseMMU::Write;
316 } else {
317 fatal("unexpected MemCmd\n");
318 }
319
320 tmp_addr = req->getVaddr();
321 Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
322
323 assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
324
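    // Split arithmetic example (editorial annotation, not part of the upstream
    // file): with block_size = 64, a read of size = 8 at vaddr 0x7c ends at
    // 0x83, so split_addr = roundDown(0x83, 64) = 0x80 > 0x7c and the access
    // is split into [0x7c, 0x80) and [0x80, 0x84), one packet per cache line.
    // An access contained in a single line yields split_addr <= vaddr and
    // takes the else path below.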
325 // Misaligned access
326 if (split_addr > tmp_addr) {
327 RequestPtr req1, req2;
328 req->splitOnVaddr(split_addr, req1, req2);
329
330 PacketPtr pkt1 = new Packet(req2, cmd);
331 PacketPtr pkt2 = new Packet(req1, cmd);
332
333 functionalTLBAccess(pkt1, cu_id, trans_mode);
334 functionalTLBAccess(pkt2, cu_id, trans_mode);
335
336 PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
337 PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
338
339 new_pkt1->dataStatic(data);
340 new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
341
342 if (suppress_func_errors) {
343 new_pkt1->setSuppressFuncError();
344 new_pkt2->setSuppressFuncError();
345 }
346
347 // fixme: this should be cuList[cu_id] if cu_id != n_cu
348 // The latter requires a memPort in the dispatcher
349 cuList[0]->memPort[0].sendFunctional(new_pkt1);
350 cuList[0]->memPort[0].sendFunctional(new_pkt2);
351
352 delete new_pkt1;
353 delete new_pkt2;
354 delete pkt1;
355 delete pkt2;
356 } else {
357 PacketPtr pkt = new Packet(req, cmd);
358 functionalTLBAccess(pkt, cu_id, trans_mode);
359 PacketPtr new_pkt = new Packet(pkt->req, cmd);
360 new_pkt->dataStatic(data);
361
362 if (suppress_func_errors) {
363 new_pkt->setSuppressFuncError();
364 };
365
366 // fixme: this should be cuList[cu_id] if cu_id != n_cu
367 // The latter requires a memPort in the dispatcher
368 cuList[0]->memPort[0].sendFunctional(new_pkt);
369
370 delete new_pkt;
371 delete pkt;
372 }
373}
374
375void
376Shader::ScheduleAdd(int *val, Tick when, int x)
377{
378 sa_val.push_back(val);
379 when += curTick();
380 sa_when.push_back(when);
381 sa_x.push_back(x);
382 ++sa_n;
383 if (!tickEvent.scheduled() || (when < tickEvent.when())) {
384 DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
385 "%lu\n", when);
386 reschedule(tickEvent, when, true);
387 } else {
388 assert(tickEvent.scheduled());
389 DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
390 "%lu\n", when);
391 }
392}
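// Usage sketch (editorial example; 'counter' and the 4-cycle delay are
// hypothetical, not from the upstream source). ScheduleAdd(val, when, x)
// records that *val should be adjusted by x once 'when' ticks have elapsed
// from now, and arms tickEvent if the new entry is due before any pending
// wakeup; execScheduledAdds() then applies every entry whose time has passed
// and re-arms the event for the latest remaining entry.
//     int counter = 1;
//     shader->ScheduleAdd(&counter, shader->cyclesToTicks(Cycles(4)), -1);
//     // execScheduledAdds() applies the -1 once 4 cycles have elapsed.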
393
394void
395Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
396 MemCmd cmd, bool suppress_func_errors)
397{
398 uint8_t *data_buf = (uint8_t*)ptr;
399
400 for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
401 !gen.done(); gen.next()) {
402
403 RequestPtr req = std::make_shared<Request>(
404 gen.addr(), gen.size(), 0,
405 cuList[0]->requestorId(), 0, 0, nullptr);
406
407 doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
408 data_buf += gen.size();
409 }
410}
411
412void
413Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
414{
415 AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
416}
417
418void
419Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
420 bool suppress_func_errors)
421{
422 AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
423 suppress_func_errors);
424}
425
426void
427Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
428{
429 AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
430}
431
432void
433Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
434 bool suppress_func_errors)
435{
436 AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
437 suppress_func_errors);
438}
439
440/*
441 * Send a packet through the appropriate TLB functional port.
442 * If cu_id=n_cu, then this is the dispatcher's TLB.
443 * Otherwise it's the TLB of the cu_id compute unit.
444 */
445void
446Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
447{
448 // update senderState. Need to know the gpuTc and the TLB mode
449 pkt->senderState =
450 new GpuTranslationState(mode, gpuTc, false);
451
452 // even when the perLaneTLB flag is turned on
453 // it's ok to send all accesses through lane 0
454 // since the lane # is not known here.
455 // This isn't important since these are functional accesses.
456 cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
457
458 /* safe_cast the senderState */
459 GpuTranslationState *sender_state =
460 safe_cast<GpuTranslationState*>(pkt->senderState);
461
462 delete sender_state->tlbEntry;
463 delete pkt->senderState;
464}
465
466/*
467 * allow the shader to sample stats from constituent devices
468 */
469void
470Shader::sampleStore(const Tick accessTime)
471{
472 stats.storeLatencyDist.sample(accessTime);
473 stats.allLatencyDist.sample(accessTime);
474}
475
476/*
477 * allow the shader to sample stats from constituent devices
478 */
479void
480Shader::sampleLoad(const Tick accessTime)
481{
482 stats.loadLatencyDist.sample(accessTime);
483 stats.allLatencyDist.sample(accessTime);
484}
485
486void
487Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
488{
489 // Only sample instructions that go all the way to main memory
490 if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
491 return;
492 }
493
494 Tick t1 = roundTripTime[0];
495 Tick t2 = roundTripTime[1];
496 Tick t3 = roundTripTime[2];
497 Tick t4 = roundTripTime[3];
498 Tick t5 = roundTripTime[4];
499
500 stats.initToCoalesceLatency.sample(t2-t1);
501 stats.rubyNetworkLatency.sample(t3-t2);
502 stats.gmEnqueueLatency.sample(t4-t3);
503 stats.gmToCompleteLatency.sample(t5-t4);
504}
505
506void
507Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
508{
509 stats.coalsrLineAddresses.sample(lineMap.size());
510 std::vector<Tick> netTimes;
511
512 // For each cache block address generated by a vmem inst, calculate
513 // the round-trip time for that cache block.
514 for (auto& it : lineMap) {
515 const std::vector<Tick>& timeVec = it.second;
516 if (timeVec.size() == 2) {
517 netTimes.push_back(timeVec[1] - timeVec[0]);
518 }
519 }
520
521 // Sort the cache block round trip times so that the first
522 // distribution is always measuring the fastest and the last
523 // distribution is always measuring the slowest cache block.
524 std::sort(netTimes.begin(), netTimes.end());
525
526 // Sample the round trip time for each N cache blocks into the
527 // Nth distribution.
528 int idx = 0;
529 for (auto& time : netTimes) {
530 stats.cacheBlockRoundTrip[idx].sample(time);
531 ++idx;
532 }
533}
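// Sampling example (editorial annotation, not part of the upstream file): if a
// vector memory instruction touched three cache lines whose issue/completion
// ticks differ by 900, 300 and 600, the sorted netTimes become {300, 600, 900}
// and are sampled into cacheBlockRoundTrip[0], [1] and [2] respectively, so
// the Nth distribution always tracks the Nth-fastest line of an instruction.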
534
535void
536Shader::notifyCuSleep() {
537 // If all CUs attached to this shader are asleep, update shaderActiveTicks
538 panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
539 "Invalid activeCu size\n");
540 _activeCus--;
541 if (!_activeCus) {
542 stats.shaderActiveTicks += curTick() - _lastInactiveTick;
543
544 if (kernelExitRequested) {
545 kernelExitRequested = false;
546 if (blitKernel) {
547 exitSimLoop("GPU Blit Kernel Completed");
548 } else {
549 exitSimLoop("GPU Kernel Completed");
550 }
551 }
552 }
553}
554
555void
556Shader::decNumOutstandingInvL2s()
557{
558 num_outstanding_invl2s--;
559
560 if (num_outstanding_invl2s == 0 && !deferred_dispatches.empty()) {
561 for (auto &dispatch : deferred_dispatches) {
562 gpuCmdProc.submitDispatchPkt(std::get<0>(dispatch),
563 std::get<1>(dispatch),
564 std::get<2>(dispatch));
565 }
566 deferred_dispatches.clear();
567 }
568}
569
570void
571Shader::addDeferredDispatch(void *raw_pkt, uint32_t queue_id,
572 Addr host_pkt_addr)
573{
574 deferred_dispatches.push_back(
575 std::make_tuple(raw_pkt, queue_id, host_pkt_addr));
576}
577
581RequestorID
582Shader::vramRequestorId()
583{
584 return gpuCmdProc.vramRequestorId();
585}
586
587Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
588 : statistics::Group(parent),
589 ADD_STAT(allLatencyDist, "delay distribution for all"),
590 ADD_STAT(loadLatencyDist, "delay distribution for loads"),
591 ADD_STAT(storeLatencyDist, "delay distribution for stores"),
592 ADD_STAT(initToCoalesceLatency,
593 "Ticks from vmem inst initiateAcc to coalescer issue"),
594 ADD_STAT(rubyNetworkLatency,
595 "Ticks from coalescer issue to coalescer hit callback"),
596 ADD_STAT(gmEnqueueLatency,
597 "Ticks from coalescer hit callback to GM pipe enqueue"),
598 ADD_STAT(gmToCompleteLatency,
599 "Ticks queued in GM pipes ordered response buffer"),
600 ADD_STAT(coalsrLineAddresses,
601 "Number of cache lines for coalesced request"),
602 ADD_STAT(shaderActiveTicks,
603 "Total ticks that any CU attached to this shader is active"),
604 ADD_STAT(vectorInstSrcOperand,
605 "vector instruction source operand distribution"),
606 ADD_STAT(vectorInstDstOperand,
607 "vector instruction destination operand distribution")
608{
609 allLatencyDist
610 .init(0, 1600000-1, 10000)
611 .flags(statistics::pdf | statistics::oneline);
612
613 loadLatencyDist
614 .init(0, 1600000-1, 10000)
615 .flags(statistics::pdf | statistics::oneline);
616
617 storeLatencyDist
618 .init(0, 1600000-1, 10000)
619 .flags(statistics::pdf | statistics::oneline);
620
621 initToCoalesceLatency
622 .init(0, 1600000-1, 10000)
623 .flags(statistics::pdf | statistics::oneline);
624
625 rubyNetworkLatency
626 .init(0, 1600000-1, 10000)
627 .flags(statistics::pdf | statistics::oneline);
628
629 gmEnqueueLatency
630 .init(0, 1600000-1, 10000)
631 .flags(statistics::pdf | statistics::oneline);
632
633 gmToCompleteLatency
634 .init(0, 1600000-1, 10000)
635 .flags(statistics::pdf | statistics::oneline);
636
637 coalsrLineAddresses
638 .init(0, 20, 1)
639 .flags(statistics::pdf | statistics::oneline);
640
641 vectorInstSrcOperand.init(4);
642 vectorInstDstOperand.init(4);
643
644 cacheBlockRoundTrip = new statistics::Distribution[wf_size];
645 for (int idx = 0; idx < wf_size; ++idx) {
646 std::stringstream namestr;
647 ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
648 static_cast<Shader*>(parent)->name(), idx);
649 cacheBlockRoundTrip[idx]
650 .init(0, 1600000-1, 10000)
651 .name(namestr.str())
652 .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
653 .flags(statistics::pdf | statistics::oneline);
654 }
655}
656
657} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:210
Declaration and inline definition of ChunkGenerator object.
const char data[]
virtual ThreadContext * getContext(int tn)
Given a thread num get the thread context for it.
Definition base.hh:288
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from GPU device.
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
int getOutstandingWbs(int kern_id)
get kernel's outstanding cache writeback requests
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
virtual std::string name() const
Definition named.hh:47
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition packet.hh:1175
SenderState * senderState
This packet's sender state.
Definition packet.hh:545
RequestPtr req
A pointer to the original request.
Definition packet.hh:377
void setSuppressFuncError()
Definition packet.hh:757
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward,...
Definition process.hh:146
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
Definition process.cc:317
std::shared_ptr< MemState > memState
Definition process.hh:289
bool kernelExitRequested
Definition shader.hh:102
Addr mmap(int length)
Definition shader.cc:117
void prepareInvalidate(HSAQueueEntry *task)
Definition shader.cc:203
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition shader.cc:395
bool blitKernel
Definition shader.hh:105
void notifyCuSleep()
Definition shader.cc:536
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
Definition shader.cc:303
void execScheduledAdds()
Definition shader.cc:173
EventFunctionWrapper tickEvent
Definition shader.hh:230
std::vector< ComputeUnit * > cuList
Definition shader.hh:268
void addDeferredDispatch(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
Definition shader.cc:571
int nextSchedCu
Definition shader.hh:255
void ScheduleAdd(int *val, Tick when, int x)
Definition shader.cc:376
GPUDispatcher & _dispatcher
Definition shader.hh:271
uint32_t sa_n
Definition shader.hh:258
ShaderParams Params
Definition shader.hh:113
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick > > &roundTripTime)
Definition shader.cc:507
std::vector< uint64_t > sa_when
Definition shader.hh:263
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition shader.cc:150
std::vector< int32_t > sa_x
Definition shader.hh:265
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition shader.cc:413
gem5::Shader::ShaderStats stats
std::vector< std::tuple< void *, uint32_t, Addr > > deferred_dispatches
Definition shader.hh:110
ThreadContext * gpuTc
Definition shader.hh:124
bool dispatchWorkgroups(HSAQueueEntry *task)
Definition shader.cc:254
GPUDispatcher & dispatcher()
Definition shader.cc:111
Shader(const Params &p)
Definition shader.cc:57
void decNumOutstandingInvL2s()
Definition shader.cc:556
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
Definition shader.cc:582
void updateContext(int cid)
Definition shader.cc:165
int n_cu_per_sqc
Definition shader.hh:249
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition shader.cc:427
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
Definition shader.cc:241
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition shader.cc:487
void sampleLoad(const Tick accessTime)
Definition shader.cc:480
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
Definition shader.cc:446
int num_outstanding_invl2s
Definition shader.hh:109
void sampleStore(const Tick accessTime)
Definition shader.cc:470
BaseCPU * cpuPointer
Definition shader.hh:125
GPUCommandProcessor & gpuCmdProc
Definition shader.hh:270
Tick _lastInactiveTick
Definition shader.hh:98
std::vector< int * > sa_val
Definition shader.hh:261
int _activeCus
Definition shader.hh:95
virtual Process * getProcessPtr()=0
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
void sample(const U &v, int n=1)
Add a value to the distribution n times.
A simple distribution stat.
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Statistics container.
Definition group.hh:93
Derived & init(size_type size)
Set this vector to have the given size.
STL vector class.
Definition stl.hh:37
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition group.hh:75
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Definition intmath.hh:279
static constexpr T roundUp(const T &val, const U &align)
This function is used to align addresses in memory.
Definition intmath.hh:260
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
bool scheduled() const
Determine if the current event is scheduled.
Definition eventq.hh:458
void schedule(Event &event, Tick when)
Definition eventq.hh:1012
void reschedule(Event &event, Tick when, bool always=false)
Definition eventq.hh:1030
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition eventq.hh:207
Tick when() const
Get the time that the event is scheduled.
Definition eventq.hh:501
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:200
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition logging.hh:214
HSAQueueEntry is the simulator's internal representation of an AQL queue entry (task).
Bitfield< 4 > t4
Bitfield< 2 > t2
Bitfield< 4, 0 > mode
Definition misc_types.hh:74
Bitfield< 3 > t3
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 5 > t5
Bitfield< 1 > t1
Bitfield< 0 > p
Bitfield< 3 > x
Definition pagetable.hh:73
Bitfield< 63 > val
Definition misc.hh:804
const Addr PageBytes
Definition page_size.hh:49
const FlagsType pdf
Print the percent of the total that this entry represents.
Definition info.hh:61
const FlagsType oneline
Print all values on a single line.
Definition info.hh:71
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
Definition binary32.hh:36
T safe_cast(U &&ref_or_ptr)
Definition cast.hh:74
std::shared_ptr< Request > RequestPtr
Definition request.hh:94
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
@ InstMemoryHopMax
Definition misc.hh:58
uint64_t Tick
Tick count type.
Definition types.hh:58
void exitSimLoop(const std::string &message, int exit_code, Tick when, Tick repeat, bool serialize)
Schedule an event to exit the simulation loop (returning to Python) at the end of the current cycle (...
Definition sim_events.cc:88
uint16_t RequestorID
Definition request.hh:95
void ccprintf(cp::Print &print)
Definition cprintf.hh:130
@ HW_REG_SH_MEM_BASES
Declaration of the Packet class.
GPU TranslationState: this currently is somewhat of a bastardization of the usage of SenderState,...
statistics::Vector vectorInstSrcOperand
Definition shader.hh:375
statistics::Distribution storeLatencyDist
Definition shader.hh:353
statistics::Distribution initToCoalesceLatency
Definition shader.hh:356
statistics::Scalar shaderActiveTicks
Definition shader.hh:374
statistics::Distribution loadLatencyDist
Definition shader.hh:352
statistics::Distribution allLatencyDist
Definition shader.hh:351
statistics::Distribution gmToCompleteLatency
Definition shader.hh:365
ShaderStats(statistics::Group *parent, int wf_size)
Definition shader.cc:587
statistics::Distribution coalsrLineAddresses
Definition shader.hh:368
statistics::Vector vectorInstDstOperand
Definition shader.hh:376
statistics::Distribution rubyNetworkLatency
Definition shader.hh:359
statistics::Distribution * cacheBlockRoundTrip
Definition shader.hh:372
statistics::Distribution gmEnqueueLatency
Definition shader.hh:362
