gem5  v22.1.0.0
shader.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/shader.hh"
33 
34 #include <limits>
35 
38 #include "base/chunk_generator.hh"
39 #include "debug/GPUAgentDisp.hh"
40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUMem.hh"
42 #include "debug/GPUShader.hh"
43 #include "debug/GPUWgLatency.hh"
48 #include "gpu-compute/wavefront.hh"
49 #include "mem/packet.hh"
51 #include "sim/sim_exit.hh"
52 
53 namespace gem5
54 {
55 
56 Shader::Shader(const ShaderParams &p) : ClockedObject(p),
57  _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
58  gpuTc(nullptr), cpuPointer(p.cpu_pointer),
59  tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
60  false, Event::CPU_Tick_Pri),
61  timingSim(p.timing), hsail_mode(SIMT),
62  impl_kern_launch_acq(p.impl_kern_launch_acq),
63  impl_kern_end_rel(p.impl_kern_end_rel),
64  coissue_return(1),
65  trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
66  globalMemSize(p.globalmem),
67  nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
68  _dispatcher(*p.dispatcher), systemHub(p.system_hub),
69  max_valu_insts(p.max_valu_insts), total_valu_insts(0),
70  stats(this, p.CUs[0]->wfSize())
71 {
72  gpuCmdProc.setShader(this);
73  _dispatcher.setShader(this);
74 
75  _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
76  _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
77 
78  _ldsApe.base = ((Addr)1 << 61) + 0x0;
79  _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
80 
81  _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
82  _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
83 
84  shHiddenPrivateBaseVmid = 0;
85 
86  cuList.resize(n_cu);
87 
88  panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
89 
90  for (int i = 0; i < n_cu; ++i) {
91  cuList[i] = p.CUs[i];
92  assert(i == cuList[i]->cu_id);
93  cuList[i]->shader = this;
94  cuList[i]->idleCUTimeout = p.idlecu_timeout;
95  }
96 }
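For readers tracing the aperture setup above, the following standalone sketch (not part of shader.cc; the constants are copied from the constructor) shows the GPU VM aperture range those masks produce:

// Standalone sketch of the GPU VM aperture arithmetic in Shader::Shader().
// Not gem5 code; it only reproduces the constants above to show the result.
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main()
{
    uint64_t base  = (uint64_t{1} << 61) + 0x1000000000000ULL;
    uint64_t limit = (base & 0xFFFFFF0000000000ULL) | 0xFFFFFFFFFFULL;

    // Prints base=0x2001000000000000 limit=0x200100ffffffffff
    std::printf("base=%#" PRIx64 " limit=%#" PRIx64 "\n", base, limit);
    return 0;
}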
97 
98 GPUDispatcher&
99 Shader::dispatcher()
100 {
101  return _dispatcher;
102 }
103 
104 Addr
105 Shader::mmap(int length)
106 {
107 
108  Addr start;
109 
110  // round up length to the next page
111  length = roundUp(length, X86ISA::PageBytes);
112 
113  Process *proc = gpuTc->getProcessPtr();
114  auto mem_state = proc->memState;
115 
116  if (proc->mmapGrowsDown()) {
117  DPRINTF(GPUShader, "GROWS DOWN");
118  start = mem_state->getMmapEnd() - length;
119  mem_state->setMmapEnd(start);
120  } else {
121  DPRINTF(GPUShader, "GROWS UP");
122  start = mem_state->getMmapEnd();
123  mem_state->setMmapEnd(start + length);
124 
125  // assertion to make sure we don't overwrite the stack (it grows down)
126  assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
127  mem_state->getMmapEnd());
128  }
129 
130  DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
131 
132  proc->allocateMem(start, length);
133 
134  return start;
135 }
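Shader::mmap() first rounds the requested length up to a whole number of pages before moving the mmap end. A minimal sketch of that rounding, assuming 4 KiB x86 pages; the local roundUp below is a stand-in for gem5's roundUp() from intmath.hh:

// Minimal sketch of the page rounding in Shader::mmap(); the local roundUp
// is a stand-in for gem5's roundUp() and X86ISA::PageBytes is assumed 4096.
#include <cassert>
#include <cstdint>

constexpr uint64_t roundUp(uint64_t val, uint64_t align)
{
    return ((val + align - 1) / align) * align;
}

int main()
{
    constexpr uint64_t PageBytes = 4096;
    assert(roundUp(1, PageBytes)    == 4096);   // tiny request still takes a page
    assert(roundUp(5000, PageBytes) == 8192);   // 5000 bytes spills into a second page
    assert(roundUp(8192, PageBytes) == 8192);   // aligned lengths are unchanged
    return 0;
}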
136 
137 void
138 Shader::init()
139 {
140  // grab the threadContext of the thread running on the CPU
141  assert(cpuPointer);
142  gpuTc = cpuPointer->getContext(0);
143  assert(gpuTc);
144 }
145 
146 Shader::~Shader()
147 {
148  for (int j = 0; j < n_cu; ++j)
149  delete cuList[j];
150 }
151 
152 void
153 Shader::updateContext(int cid) {
154  // context of the thread which dispatched work
155  assert(cpuPointer);
156  gpuTc = cpuPointer->getContext(cid);
157  assert(gpuTc);
158 }
159 
160 void
161 Shader::execScheduledAdds()
162 {
163  assert(!sa_when.empty());
164 
165  // apply any scheduled adds
166  for (int i = 0; i < sa_n; ++i) {
167  if (sa_when[i] <= curTick()) {
168  *sa_val[i] += sa_x[i];
169  panic_if(*sa_val[i] < 0, "Negative counter value\n");
170  sa_val.erase(sa_val.begin() + i);
171  sa_x.erase(sa_x.begin() + i);
172  sa_when.erase(sa_when.begin() + i);
173  --sa_n;
174  --i;
175  }
176  }
177  if (!sa_when.empty()) {
178  Tick shader_wakeup = *std::max_element(sa_when.begin(),
179  sa_when.end());
180  DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
181  schedule(tickEvent, shader_wakeup);
182  } else {
183  DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
184  }
185 }
186 
187 /*
188  * dispatcher/shader arranges invalidate requests to the CUs
189  */
190 void
191 Shader::prepareInvalidate(HSAQueueEntry *task) {
192  // if invalidate has already started/finished, then do nothing
193  if (task->isInvStarted()) return;
194 
195  // invalidate has never started; it is only performed once, at kernel launch
196  assert(task->outstandingInvs() == -1);
197  int kernId = task->dispatchId();
198  // counter value is 0 now, indicating the inv is about to start
199  _dispatcher.updateInvCounter(kernId, +1);
200 
201  // iterate over all CUs managed by the shader to perform the invalidate.
202  for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
203  // create a request to hold INV info; the request's fields will
204  // be updated in cu before use
205  auto req = std::make_shared<Request>(0, 0, 0,
206  cuList[i_cu]->requestorId(),
207  0, -1);
208 
209  _dispatcher.updateInvCounter(kernId, +1);
210  // all necessary INV flags are set now; call the CU to execute
211  cuList[i_cu]->doInvalidate(req, task->dispatchId());
212 
213  // I don't like this. This is intrusive coding.
214  cuList[i_cu]->resetRegisterPool();
215  }
216 }
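The bookkeeping in prepareInvalidate() follows the counter convention its comments describe: a fresh kernel's outstanding-invalidate count is -1, the first updateInvCounter(kernId, +1) brings it to 0 ("about to start"), and each CU invalidate adds one more. A condensed, hypothetical sketch of that convention (not the dispatcher's actual code):

// Hypothetical sketch of the per-kernel invalidate counter convention that
// prepareInvalidate() relies on (see the comments above); not gem5 code.
#include <cassert>

int main()
{
    const int n_cu = 4;
    int outstandingInvs = -1;            // initial value: invalidate never started

    outstandingInvs += 1;                // first updateInvCounter(kernId, +1)
    assert(outstandingInvs == 0);        // "about to start"

    for (int cu = 0; cu < n_cu; ++cu)
        outstandingInvs += 1;            // one outstanding invalidate per CU

    assert(outstandingInvs == n_cu);     // decremented elsewhere (val = -1) as they complete
    return 0;
}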
217 
218 /**
219  * dispatcher/shader arranges flush requests to the CUs
220  */
221 void
222 Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
223  int kernId = gpuDynInst->kern_id;
224  // flush has never been started, performed only once at kernel end
225  assert(_dispatcher.getOutstandingWbs(kernId) == 0);
226 
227  // the first CU managed by the shader performs the flush operation,
228  // assuming that the L2 cache is shared by all CUs in the shader
229  int i_cu = 0;
230  _dispatcher.updateWbCounter(kernId, +1);
231  cuList[i_cu]->doFlush(gpuDynInst);
232 }
233 
234 bool
235 Shader::dispatchWorkgroups(HSAQueueEntry *task)
236 {
237  bool scheduledSomething = false;
238  int cuCount = 0;
239  int curCu = nextSchedCu;
240  int disp_count(0);
241 
242  while (cuCount < n_cu) {
243  //Every time we try a CU, update nextSchedCu
244  nextSchedCu = (nextSchedCu + 1) % n_cu;
245 
246  // dispatch workgroup iff the following two conditions are met:
247  // (a) wg_rem is true - there are unassigned workgroups in the grid
248  // (b) there are enough free slots in cu cuList[i] for this wg
249  int num_wfs_in_wg = 0;
250  bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
251  if (!task->dispComplete() && can_disp) {
252  scheduledSomething = true;
253  DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
254  curCu, task->globalWgId());
255  DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
256  curCu, task->globalWgId());
257  DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
258  curTick(), task->globalWgId(), curCu);
259 
260  if (!cuList[curCu]->tickEvent.scheduled()) {
261  if (!_activeCus)
262  _lastInactiveTick = curTick();
263  _activeCus++;
264  }
265 
266  panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
267  "Invalid activeCu size\n");
268  cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
269 
270  task->markWgDispatch();
271  ++disp_count;
272  }
273 
274  ++cuCount;
275  curCu = nextSchedCu;
276  }
277 
278  DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);
279 
280  return scheduledSomething;
281 }
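dispatchWorkgroups() tries every CU once per call, starting at nextSchedCu and advancing it before each resource check. A condensed sketch of just that visit order (hypothetical values, not gem5 code):

// Condensed sketch of the round-robin CU scan in dispatchWorkgroups();
// only the visit order is modeled, not the dispatch-resource checks.
#include <cstdio>

int main()
{
    const int n_cu = 4;
    int nextSchedCu = 2;        // persists across calls, like the Shader member
    int curCu = nextSchedCu;

    for (int cuCount = 0; cuCount < n_cu; ++cuCount) {
        nextSchedCu = (nextSchedCu + 1) % n_cu;   // advance even when curCu is skipped
        std::printf("try CU %d\n", curCu);        // visit order: 2, 3, 0, 1
        curCu = nextSchedCu;
    }
    return 0;
}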
282 
283 void
284 Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
285  bool suppress_func_errors, int cu_id)
286 {
287  int block_size = cuList.at(cu_id)->cacheLineSize();
288  unsigned size = req->getSize();
289 
290  Addr tmp_addr;
291  BaseMMU::Mode trans_mode;
292 
293  if (cmd == MemCmd::ReadReq) {
294  trans_mode = BaseMMU::Read;
295  } else if (cmd == MemCmd::WriteReq) {
296  trans_mode = BaseMMU::Write;
297  } else {
298  fatal("unexpected MemCmd\n");
299  }
300 
301  tmp_addr = req->getVaddr();
302  Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
303 
304  assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
305 
306  // Misaligned access
307  if (split_addr > tmp_addr) {
308  RequestPtr req1, req2;
309  req->splitOnVaddr(split_addr, req1, req2);
310 
311  PacketPtr pkt1 = new Packet(req2, cmd);
312  PacketPtr pkt2 = new Packet(req1, cmd);
313 
314  functionalTLBAccess(pkt1, cu_id, trans_mode);
315  functionalTLBAccess(pkt2, cu_id, trans_mode);
316 
317  PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
318  PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
319 
320  new_pkt1->dataStatic(data);
321  new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
322 
323  if (suppress_func_errors) {
324  new_pkt1->setSuppressFuncError();
325  new_pkt2->setSuppressFuncError();
326  }
327 
328  // fixme: this should be cuList[cu_id] if cu_id != n_cu
329  // The latter requires a memPort in the dispatcher
330  cuList[0]->memPort[0].sendFunctional(new_pkt1);
331  cuList[0]->memPort[0].sendFunctional(new_pkt2);
332 
333  delete new_pkt1;
334  delete new_pkt2;
335  delete pkt1;
336  delete pkt2;
337  } else {
338  PacketPtr pkt = new Packet(req, cmd);
339  functionalTLBAccess(pkt, cu_id, trans_mode);
340  PacketPtr new_pkt = new Packet(pkt->req, cmd);
341  new_pkt->dataStatic(data);
342 
343  if (suppress_func_errors) {
344  new_pkt->setSuppressFuncError();
345  };
346 
347  // fixme: this should be cuList[cu_id] if cu_id != n_cu
348  // The latter requires a memPort in the dispatcher
349  cuList[0]->memPort[0].sendFunctional(new_pkt);
350 
351  delete new_pkt;
352  delete pkt;
353  }
354 }
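The misaligned path above triggers whenever roundDown(vaddr + size - 1, block_size) lands beyond the request's start address, i.e. the last byte falls in a later cache line than the first. A small sketch with concrete numbers; the local roundDown stands in for gem5's intmath helper:

// Sketch of the cache-line straddle test used by doFunctionalAccess();
// the local roundDown stands in for gem5's roundDown() from intmath.hh.
#include <cassert>
#include <cstdint>

constexpr uint64_t roundDown(uint64_t val, uint64_t align)
{
    return val - (val % align);
}

int main()
{
    constexpr uint64_t block_size = 64;

    // 8-byte access at 0x1000: last byte 0x1007 is in the same line, no split.
    assert(roundDown(0x1000 + 8 - 1, block_size) <= 0x1000);

    // 8-byte access at 0x103C: last byte 0x1043 is in the next line, so the
    // request is split at 0x1040 into two 4-byte pieces.
    assert(roundDown(0x103C + 8 - 1, block_size) == 0x1040);
    return 0;
}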
355 
356 void
357 Shader::ScheduleAdd(int *val, Tick when, int x)
358 {
359  sa_val.push_back(val);
360  when += curTick();
361  sa_when.push_back(when);
362  sa_x.push_back(x);
363  ++sa_n;
364  if (!tickEvent.scheduled() || (when < tickEvent.when())) {
365  DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
366  "%lu\n", when);
367  reschedule(tickEvent, when, true);
368  } else {
369  assert(tickEvent.scheduled());
370  DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
371  "%lu\n", when);
372  }
373 }
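ScheduleAdd() queues a deferred increment: callers pass a relative delay (note the when += curTick() above), and execScheduledAdds() applies every add whose absolute tick has been reached when the tick event fires. A condensed, event-queue-free sketch of that contract (hypothetical, not gem5 code):

// Hypothetical sketch of the ScheduleAdd()/execScheduledAdds() contract:
// (val, when, x) tuples are queued with an absolute tick, and each add is
// applied once the simulated clock reaches 'when'. No event queue here.
#include <cassert>
#include <cstdint>
#include <vector>

struct DeferredAdd { int *val; uint64_t when; int x; };

int main()
{
    int outstanding_reqs = 2;
    std::vector<DeferredAdd> pending;

    uint64_t now = 100;                                      // stand-in for curTick()
    pending.push_back({&outstanding_reqs, now + 900, -1});   // like ScheduleAdd(&cnt, 900, -1)

    now = 1000;                                              // simulated time advances
    for (auto &d : pending)
        if (d.when <= now)
            *d.val += d.x;                                   // what execScheduledAdds() does

    assert(outstanding_reqs == 1);
    return 0;
}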
374 
375 void
376 Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
377  MemCmd cmd, bool suppress_func_errors)
378 {
379  uint8_t *data_buf = (uint8_t*)ptr;
380 
381  for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
382  !gen.done(); gen.next()) {
383 
384  RequestPtr req = std::make_shared<Request>(
385  gen.addr(), gen.size(), 0,
386  cuList[0]->requestorId(), 0, 0, nullptr);
387 
388  doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
389  data_buf += gen.size();
390  }
391 }
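AccessMem() relies on ChunkGenerator to cut an (address, size) region into pieces that never cross a cache-line boundary, issuing one functional access per piece. A minimal standalone sketch of equivalent chunking (hypothetical, not gem5's ChunkGenerator):

// Hypothetical sketch of the chunking that the ChunkGenerator loop in
// AccessMem() performs: split (addr, size) into pieces that never cross
// a cache-line boundary. Not gem5's ChunkGenerator implementation.
#include <algorithm>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main()
{
    constexpr uint64_t line = 64;            // cacheLineSize()
    uint64_t addr = 0x1030, size = 100;      // example request

    while (size > 0) {
        uint64_t in_line = line - (addr % line);         // bytes left in this line
        uint64_t chunk = std::min<uint64_t>(size, in_line);
        // Prints: 0x1030/16, 0x1040/64, 0x1080/20
        std::printf("chunk addr=%#" PRIx64 " size=%" PRIu64 "\n", addr, chunk);
        addr += chunk;
        size -= chunk;
    }
    return 0;
}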
392 
393 void
394 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
395 {
396  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
397 }
398 
399 void
400 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
401  bool suppress_func_errors)
402 {
403  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
404  suppress_func_errors);
405 }
406 
407 void
408 Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
409 {
410  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
411 }
412 
413 void
414 Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
415  bool suppress_func_errors)
416 {
417  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
418  suppress_func_errors);
419 }
420 
421 /*
422  * Send a packet through the appropriate TLB functional port.
423  * If cu_id=n_cu, then this is the dispatcher's TLB.
424  * Otherwise it's the TLB of the cu_id compute unit.
425  */
426 void
427 Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
428 {
429  // update senderState. Need to know the gpuTc and the TLB mode
430  pkt->senderState =
431  new GpuTranslationState(mode, gpuTc, false);
432 
433  // even when the perLaneTLB flag is turned on
434  // it's ok to send all accesses through lane 0
435  // since the lane # is not known here,
436  // This isn't important since these are functional accesses.
437  cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
438 
439  /* safe_cast the senderState */
440  GpuTranslationState *sender_state =
441  safe_cast<GpuTranslationState*>(pkt->senderState);
442 
443  delete sender_state->tlbEntry;
444  delete pkt->senderState;
445 }
446 
447 /*
448  * allow the shader to sample stats from constituent devices
449  */
450 void
451 Shader::sampleStore(const Tick accessTime)
452 {
453  stats.storeLatencyDist.sample(accessTime);
454  stats.allLatencyDist.sample(accessTime);
455 }
456 
457 /*
458  * allow the shader to sample stats from constituent devices
459  */
460 void
461 Shader::sampleLoad(const Tick accessTime)
462 {
463  stats.loadLatencyDist.sample(accessTime);
464  stats.allLatencyDist.sample(accessTime);
465 }
466 
467 void
468 Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
469 {
470  // Only sample instructions that go all the way to main memory
471  if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
472  return;
473  }
474 
475  Tick t1 = roundTripTime[0];
476  Tick t2 = roundTripTime[1];
477  Tick t3 = roundTripTime[2];
478  Tick t4 = roundTripTime[3];
479  Tick t5 = roundTripTime[4];
480 
481  stats.initToCoalesceLatency.sample(t2-t1);
482  stats.rubyNetworkLatency.sample(t3-t2);
483  stats.gmEnqueueLatency.sample(t4-t3);
484  stats.gmToCompleteLatency.sample(t5-t4);
485 }
486 
487 void
488 Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
489 {
490  stats.coalsrLineAddresses.sample(lineMap.size());
491  std::vector<Tick> netTimes;
492 
493  // For each cache block address generated by a vmem inst, calculate
494  // the round-trip time for that cache block.
495  for (auto& it : lineMap) {
496  const std::vector<Tick>& timeVec = it.second;
497  if (timeVec.size() == 2) {
498  netTimes.push_back(timeVec[1] - timeVec[0]);
499  }
500  }
501 
502  // Sort the cache block round trip times so that the first
503  // distribution is always measuring the fastest and the last
504  // distribution is always measuring the slowest cache block.
505  std::sort(netTimes.begin(), netTimes.end());
506 
507  // Sample the round trip time for each N cache blocks into the
508  // Nth distribution.
509  int idx = 0;
510  for (auto& time : netTimes) {
511  stats.cacheBlockRoundTrip[idx].sample(time);
512  ++idx;
513  }
514 }
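sampleLineRoundTrip() expects each cache-line address to map to a pair of ticks (issue and return); the per-line round trip is their difference, and the sorted differences index the cacheBlockRoundTrip distributions. A small sketch of that computation on hypothetical data:

// Hypothetical sketch of the per-cache-line round-trip computation in
// sampleLineRoundTrip(): each line address maps to {issue_tick, return_tick},
// and the sorted differences feed the Nth cacheBlockRoundTrip distribution.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

int main()
{
    std::map<uint64_t, std::vector<uint64_t>> lineMap = {
        {0x1000, {100, 340}},     // issued at tick 100, returned at tick 340
        {0x1040, {100, 180}},
    };

    std::vector<uint64_t> netTimes;
    for (const auto &kv : lineMap)
        if (kv.second.size() == 2)
            netTimes.push_back(kv.second[1] - kv.second[0]);

    std::sort(netTimes.begin(), netTimes.end());      // fastest line first
    assert(netTimes[0] == 80 && netTimes[1] == 240);
    return 0;
}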
515 
516 void
517 Shader::notifyCuSleep() {
518  // If all CUs attached to this shader are asleep, update shaderActiveTicks
519  panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
520  "Invalid activeCu size\n");
521  _activeCus--;
522  if (!_activeCus)
523  stats.shaderActiveTicks += curTick() - _lastInactiveTick;
524 }
525 
526 /**
527  * Forward the VRAM requestor ID needed for device memory from CP
528  */
529 RequestorID
530 Shader::vramRequestorId()
531 {
532  return gpuCmdProc.vramRequestorId();
533 }
534 
535 Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
536  : statistics::Group(parent),
537  ADD_STAT(allLatencyDist, "delay distribution for all"),
538  ADD_STAT(loadLatencyDist, "delay distribution for loads"),
539  ADD_STAT(storeLatencyDist, "delay distribution for stores"),
540  ADD_STAT(initToCoalesceLatency,
541  "Ticks from vmem inst initiateAcc to coalescer issue"),
542  ADD_STAT(rubyNetworkLatency,
543  "Ticks from coalescer issue to coalescer hit callback"),
544  ADD_STAT(gmEnqueueLatency,
545  "Ticks from coalescer hit callback to GM pipe enqueue"),
546  ADD_STAT(gmToCompleteLatency,
547  "Ticks queued in GM pipes ordered response buffer"),
548  ADD_STAT(coalsrLineAddresses,
549  "Number of cache lines for coalesced request"),
550  ADD_STAT(shaderActiveTicks,
551  "Total ticks that any CU attached to this shader is active"),
552  ADD_STAT(vectorInstSrcOperand,
553  "vector instruction source operand distribution"),
554  ADD_STAT(vectorInstDstOperand,
555  "vector instruction destination operand distribution")
556 {
557  allLatencyDist
558  .init(0, 1600000, 10000)
559  .flags(statistics::pdf | statistics::oneline);
560 
561  loadLatencyDist
562  .init(0, 1600000, 10000)
563  .flags(statistics::pdf | statistics::oneline);
564 
565  storeLatencyDist
566  .init(0, 1600000, 10000)
567  .flags(statistics::pdf | statistics::oneline);
568 
569  initToCoalesceLatency
570  .init(0, 1600000, 10000)
571  .flags(statistics::pdf | statistics::oneline);
572 
573  rubyNetworkLatency
574  .init(0, 1600000, 10000)
575  .flags(statistics::pdf | statistics::oneline);
576 
577  gmEnqueueLatency
578  .init(0, 1600000, 10000)
579  .flags(statistics::pdf | statistics::oneline);
580 
581  gmToCompleteLatency
582  .init(0, 1600000, 10000)
583  .flags(statistics::pdf | statistics::oneline);
584 
585  coalsrLineAddresses
586  .init(0, 20, 1)
587  .flags(statistics::pdf | statistics::oneline);
588 
589  vectorInstSrcOperand.init(4);
590  vectorInstDstOperand.init(4);
591 
592  cacheBlockRoundTrip = new statistics::Distribution[wf_size];
593  for (int idx = 0; idx < wf_size; ++idx) {
594  std::stringstream namestr;
595  ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
596  static_cast<Shader*>(parent)->name(), idx);
597  cacheBlockRoundTrip[idx]
598  .init(0, 1600000, 10000)
599  .name(namestr.str())
600  .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
601  .flags(statistics::pdf | statistics::oneline);
602  }
603 }
604 
605 } // namespace gem5
#define DPRINTF(x,...)
Definition: trace.hh:186
Declaration and inline definition of ChunkGenerator object.
const char data[]
virtual ThreadContext * getContext(int tn)
Given a thread num, get the thread context for it.
Definition: base.hh:284
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from GPU device.
void updateInvCounter(int kern_id, int val=-1)
update the counter of outstanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:246
int getOutstandingWbs(int kern_id)
get kernel's outstanding cache writeback requests
Definition: dispatcher.cc:280
bool updateWbCounter(int kern_id, int val=-1)
update the counter of outstanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:266
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
bool dispComplete() const
virtual std::string name() const
Definition: named.hh:47
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:294
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:1162
SenderState * senderState
This packet's sender state.
Definition: packet.hh:544
RequestPtr req
A pointer to the original request.
Definition: packet.hh:376
void setSuppressFuncError()
Definition: packet.hh:755
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward,...
Definition: process.hh:147
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
Definition: process.cc:317
std::shared_ptr< MemState > memState
Definition: process.hh:290
Addr mmap(int length)
Definition: shader.cc:105
void prepareInvalidate(HSAQueueEntry *task)
Definition: shader.cc:191
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition: shader.cc:376
void notifyCuSleep()
Definition: shader.cc:517
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
Definition: shader.cc:284
void execScheduledAdds()
Definition: shader.cc:161
EventFunctionWrapper tickEvent
Definition: shader.hh:218
std::vector< ComputeUnit * > cuList
Definition: shader.hh:254
int nextSchedCu
Definition: shader.hh:241
void ScheduleAdd(int *val, Tick when, int x)
Definition: shader.cc:357
GPUDispatcher & _dispatcher
Definition: shader.hh:257
uint32_t sa_n
Definition: shader.hh:244
ShaderParams Params
Definition: shader.hh:101
std::vector< uint64_t > sa_when
Definition: shader.hh:249
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: shader.cc:138
std::vector< int32_t > sa_x
Definition: shader.hh:251
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:394
gem5::Shader::ShaderStats stats
ThreadContext * gpuTc
Definition: shader.hh:112
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
Definition: shader.cc:488
bool dispatchWorkgroups(HSAQueueEntry *task)
Definition: shader.cc:235
GPUDispatcher & dispatcher()
Definition: shader.cc:99
Shader(const Params &p)
Definition: shader.cc:56
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from CP.
Definition: shader.cc:530
void updateContext(int cid)
Definition: shader.cc:153
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:408
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
Definition: shader.cc:222
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition: shader.cc:468
void sampleLoad(const Tick accessTime)
Definition: shader.cc:461
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
Definition: shader.cc:427
void sampleStore(const Tick accessTime)
Definition: shader.cc:451
BaseCPU * cpuPointer
Definition: shader.hh:113
GPUCommandProcessor & gpuCmdProc
Definition: shader.hh:256
Tick _lastInactiveTick
Definition: shader.hh:98
std::vector< int * > sa_val
Definition: shader.hh:247
int _activeCus
Definition: shader.hh:95
virtual Process * getProcessPtr()=0
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
Definition: statistics.hh:358
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Definition: statistics.hh:289
void sample(const U &v, int n=1)
Add a value to the distribution n times.
Definition: statistics.hh:1328
A simple distribution stat.
Definition: statistics.hh:2085
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Definition: statistics.hh:2113
Statistics container.
Definition: group.hh:94
Derived & init(size_type size)
Set this vector to have the given size.
Definition: statistics.hh:1040
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:75
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Definition: intmath.hh:279
static constexpr T roundUp(const T &val, const U &align)
This function is used to align addresses in memory.
Definition: intmath.hh:260
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:465
void schedule(Event &event, Tick when)
Definition: eventq.hh:1019
void reschedule(Event &event, Tick when, bool always=false)
Definition: eventq.hh:1037
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition: eventq.hh:204
Tick when() const
Get the time that the event is scheduled.
Definition: eventq.hh:508
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:190
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:204
HSAQueueEntry is the simulator's internal representation of an AQL queue entry (task).
Bitfield< 4 > t4
Definition: misc_types.hh:231
Bitfield< 2 > t2
Definition: misc_types.hh:233
Bitfield< 4, 0 > mode
Definition: misc_types.hh:74
Bitfield< 3 > t3
Definition: misc_types.hh:232
Bitfield< 7 > i
Definition: misc_types.hh:67
Bitfield< 5 > t5
Definition: misc_types.hh:230
Bitfield< 1 > t1
Definition: misc_types.hh:234
Bitfield< 24 > j
Definition: misc_types.hh:57
Bitfield< 4 > x
Definition: pagetable.hh:61
Bitfield< 54 > p
Definition: pagetable.hh:70
Bitfield< 63 > val
Definition: misc.hh:776
const Addr PageBytes
Definition: page_size.hh:49
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:109
const FlagsType pdf
Print the percent of the total that this entry represents.
Definition: info.hh:62
const FlagsType oneline
Print all values on a single line.
Definition: info.hh:72
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
Definition: request.hh:92
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:147
@ InstMemoryHopMax
Definition: misc.hh:58
uint64_t Tick
Tick count type.
Definition: types.hh:58
uint16_t RequestorID
Definition: request.hh:95
void ccprintf(cp::Print &print)
Definition: cprintf.hh:130
Declaration of the Packet class.
GPU TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
statistics::Vector vectorInstSrcOperand
Definition: shader.hh:347
statistics::Distribution storeLatencyDist
Definition: shader.hh:325
statistics::Distribution initToCoalesceLatency
Definition: shader.hh:328
statistics::Scalar shaderActiveTicks
Definition: shader.hh:346
statistics::Distribution loadLatencyDist
Definition: shader.hh:324
statistics::Distribution allLatencyDist
Definition: shader.hh:323
statistics::Distribution gmToCompleteLatency
Definition: shader.hh:337
ShaderStats(statistics::Group *parent, int wf_size)
Definition: shader.cc:535
statistics::Distribution coalsrLineAddresses
Definition: shader.hh:340
statistics::Vector vectorInstDstOperand
Definition: shader.hh:348
statistics::Distribution rubyNetworkLatency
Definition: shader.hh:331
statistics::Distribution * cacheBlockRoundTrip
Definition: shader.hh:344
statistics::Distribution gmEnqueueLatency
Definition: shader.hh:334
