shader.cc
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/amdgpu/common/gpu_translation_state.hh"
#include "arch/amdgpu/common/tlb.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUAgentDisp.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "sim/process.hh"
#include "sim/sim_exit.hh"

namespace gem5
{

Shader::Shader(const Params &p) : ClockedObject(p),
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p.cpu_pointer),
    tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
              false, Event::CPU_Tick_Pri),
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    coissue_return(1),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher), systemHub(p.system_hub),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    stats(this, p.CUs[0]->wfSize())
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);

    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = ((Addr)1 << 61) + 0x0;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    shHiddenPrivateBaseVmid = 0;

    cuList.resize(n_cu);

    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p.CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p.idlecu_timeout;
    }
}

GPUDispatcher&
Shader::dispatcher()
{
    return _dispatcher;
}

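// Carve out a region of the simulated process's mmap space and return its
// start address; the GPU model uses this to allocate memory in the host
// process's address space.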
Addr
Shader::mmap(int length)
{

    Addr start;

    // round up length to the next page
    length = roundUp(length, X86ISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}

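// init() runs after all SimObjects have been constructed and their ports are
// connected; it caches the thread context of the CPU that drives the GPU.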
void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid) {
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

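// Walk the list of scheduled adds and apply every entry whose wakeup time
// has passed; if any entries remain, re-arm the tick event for a later
// wakeup.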
void
Shader::execScheduledAdds()
{
    assert(!sa_when.empty());

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= curTick()) {
            *sa_val[i] += sa_x[i];
            panic_if(*sa_val[i] < 0, "Negative counter value\n");
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }
    if (!sa_when.empty()) {
        Tick shader_wakeup = *std::max_element(sa_when.begin(),
                                               sa_when.end());
        DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
        schedule(tickEvent, shader_wakeup);
    } else {
        DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    }
}

/*
 * dispatcher/shader arranges invalidate requests to the CUs
 */
void
Shader::prepareInvalidate(HSAQueueEntry *task) {
    // if invalidate has already started/finished, then do nothing
    if (task->isInvStarted()) return;

    // invalidate has never started; it is only performed once, at kernel
    // launch
    assert(task->outstandingInvs() == -1);
    int kernId = task->dispatchId();
    // counter value is 0 now, indicating the inv is about to start
    _dispatcher.updateInvCounter(kernId, +1);

    // iterate over all CUs managed by the shader to perform the invalidate
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // create a request to hold INV info; the request's fields will
        // be updated in the CU before use
        auto req = std::make_shared<Request>(0, 0, 0,
                                             cuList[i_cu]->requestorId(),
                                             0, -1);

        _dispatcher.updateInvCounter(kernId, +1);
        // all necessary INV flags are set now; call the CU to execute
        cuList[i_cu]->doInvalidate(req, task->dispatchId());

        // I don't like this. This is intrusive coding.
        cuList[i_cu]->resetRegisterPool();
    }
}

/**
 * dispatcher/shader arranges flush requests to the CUs
 */
void
Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
    int kernId = gpuDynInst->kern_id;
    // flush has never been started; it is performed only once, at kernel end
    assert(_dispatcher.getOutstandingWbs(kernId) == 0);

    // the first CU managed by the shader performs the flush operation,
    // assuming that the L2 cache is shared by all CUs in the shader
    int i_cu = 0;
    _dispatcher.updateWbCounter(kernId, +1);
    cuList[i_cu]->doFlush(gpuDynInst);
}

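// Round-robin over the CUs, starting at nextSchedCu, dispatching at most one
// workgroup to each CU that has enough free resources; returns whether
// anything was dispatched on this call.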
bool
Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;
    int disp_count(0);

    while (cuCount < n_cu) {
        // Every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch workgroup iff the following two conditions are met:
        // (a) wg_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cu cuList[i] for this wg
        int num_wfs_in_wg = 0;
        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
        if (!task->dispComplete() && can_disp) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    curTick(), task->globalWgId(), curCu);

            if (!cuList[curCu]->tickEvent.scheduled()) {
                if (!_activeCus)
                    _lastInactiveTick = curTick();
                _activeCus++;
            }

            panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
                     "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);

            task->markWgDispatch();
            ++disp_count;
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);

    return scheduledSomething;
}

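// Issue a functional (timing-free) access for the given request, splitting
// it into two packets when it straddles a cache-line boundary.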
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseMMU::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseMMU::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseMMU::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // Misaligned access
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req2, cmd);
        PacketPtr pkt2 = new Packet(req1, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0].sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

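// Record a deferred add: 'x' will be added to the counter pointed to by
// 'val' once 'when' ticks have elapsed from now.  The shader tick event is
// pulled earlier if this entry's wakeup precedes the currently scheduled one.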
void
Shader::ScheduleAdd(int *val, Tick when, int x)
{
    sa_val.push_back(val);
    when += curTick();
    sa_when.push_back(when);
    sa_x.push_back(x);
    ++sa_n;
    if (!tickEvent.scheduled() || (when < tickEvent.when())) {
        DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
                "%lu\n", when);
        reschedule(tickEvent, when, true);
    } else {
        assert(tickEvent.scheduled());
        DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
                "%lu\n", when);
    }
}

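// Break a host-visible memory access into cache-line-sized chunks and issue
// a functional access for each chunk.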
void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {

        RequestPtr req = std::make_shared<Request>(
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id=n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new GpuTranslationState(mode, gpuTc, false);

    // Even when the perLaneTLB flag is turned on, it's OK to send all
    // accesses through lane 0, since the lane # is not known here.
    // This isn't important since these are functional accesses.
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);

    /* safe_cast the senderState */
    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleStore(const Tick accessTime)
{
    stats.storeLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleLoad(const Tick accessTime)
{
    stats.loadLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);
}

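// Sample the per-hop latencies of a vmem instruction's round trip.  The five
// timestamps correspond to initiateAcc, coalescer issue, coalescer hit
// callback, GM pipe enqueue, and completion (see the latency stats below).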
void
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
{
    // Only sample instructions that go all the way to main memory
    if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
        return;
    }

    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];

    stats.initToCoalesceLatency.sample(t2-t1);
    stats.rubyNetworkLatency.sample(t3-t2);
    stats.gmEnqueueLatency.sample(t4-t3);
    stats.gmToCompleteLatency.sample(t5-t4);
}

void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
{
    stats.coalsrLineAddresses.sample(lineMap.size());
    std::vector<Tick> netTimes;

    // For each cache block address generated by a vmem inst, calculate
    // the round-trip time for that cache block.
    for (auto& it : lineMap) {
        const std::vector<Tick>& timeVec = it.second;
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        }
    }

    // Sort the cache block round-trip times so that the first distribution
    // always measures the fastest cache block and the last distribution
    // always measures the slowest.
    std::sort(netTimes.begin(), netTimes.end());

    // Sample the round-trip time of the Nth cache block into the Nth
    // distribution.
    int idx = 0;
    for (auto& time : netTimes) {
        stats.cacheBlockRoundTrip[idx].sample(time);
        ++idx;
    }
}

void
Shader::notifyCuSleep() {
    // If all CUs attached to this shader are asleep, update shaderActiveTicks
    panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
             "Invalid activeCu size\n");
    _activeCus--;
    if (!_activeCus)
        stats.shaderActiveTicks += curTick() - _lastInactiveTick;
}

RequestorID
Shader::vramRequestorId()
{
    return gpuCmdProc.vramRequestorId();
}

Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
    : statistics::Group(parent),
      ADD_STAT(allLatencyDist, "delay distribution for all"),
      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
      ADD_STAT(initToCoalesceLatency,
               "Ticks from vmem inst initiateAcc to coalescer issue"),
      ADD_STAT(rubyNetworkLatency,
               "Ticks from coalescer issue to coalescer hit callback"),
      ADD_STAT(gmEnqueueLatency,
               "Ticks from coalescer hit callback to GM pipe enqueue"),
      ADD_STAT(gmToCompleteLatency,
               "Ticks queued in GM pipes ordered response buffer"),
      ADD_STAT(coalsrLineAddresses,
               "Number of cache lines for coalesced request"),
      ADD_STAT(shaderActiveTicks,
               "Total ticks that any CU attached to this shader is active"),
      ADD_STAT(vectorInstSrcOperand,
               "vector instruction source operand distribution"),
      ADD_STAT(vectorInstDstOperand,
               "vector instruction destination operand distribution")
{
    allLatencyDist
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    loadLatencyDist
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    storeLatencyDist
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    initToCoalesceLatency
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    rubyNetworkLatency
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    gmEnqueueLatency
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    gmToCompleteLatency
        .init(0, 1600000, 10000)
        .flags(statistics::pdf | statistics::oneline);

    coalsrLineAddresses
        .init(0, 20, 1)
        .flags(statistics::pdf | statistics::oneline);

    vectorInstSrcOperand.init(4);
    vectorInstDstOperand.init(4);

    cacheBlockRoundTrip = new statistics::Distribution[wf_size];
    for (int idx = 0; idx < wf_size; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
                 static_cast<Shader*>(parent)->name(), idx);
        cacheBlockRoundTrip[idx]
            .init(0, 1600000, 10000)
            .name(namestr.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
            .flags(statistics::pdf);
    }
}

} // namespace gem5