gem5  v21.2.1.1
shader.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/shader.hh"
33 
34 #include <limits>
35 
37 #include "base/chunk_generator.hh"
38 #include "debug/GPUAgentDisp.hh"
39 #include "debug/GPUDisp.hh"
40 #include "debug/GPUMem.hh"
41 #include "debug/GPUShader.hh"
42 #include "debug/GPUWgLatency.hh"
47 #include "gpu-compute/wavefront.hh"
48 #include "mem/packet.hh"
50 #include "sim/sim_exit.hh"
51 
52 namespace gem5
53 {
54 
56  _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
57  gpuTc(nullptr), cpuPointer(p.cpu_pointer),
58  tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
59  false, Event::CPU_Tick_Pri),
60  timingSim(p.timing), hsail_mode(SIMT),
61  impl_kern_launch_acq(p.impl_kern_launch_acq),
62  impl_kern_end_rel(p.impl_kern_end_rel),
63  coissue_return(1),
64  trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
65  globalMemSize(p.globalmem),
66  nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
67  _dispatcher(*p.dispatcher),
68  max_valu_insts(p.max_valu_insts), total_valu_insts(0),
69  stats(this, p.CUs[0]->wfSize())
70 {
71  gpuCmdProc.setShader(this);
72  _dispatcher.setShader(this);
73 
74  _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
75  _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
76 
77  _ldsApe.base = ((Addr)1 << 61) + 0x0;
78  _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
79 
80  _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
81  _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
82 
83  shHiddenPrivateBaseVmid = 0;
84 
85  cuList.resize(n_cu);
86 
87  panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
88 
89  for (int i = 0; i < n_cu; ++i) {
90  cuList[i] = p.CUs[i];
91  assert(i == cuList[i]->cu_id);
92  cuList[i]->shader = this;
93  cuList[i]->idleCUTimeout = p.idlecu_timeout;
94  }
95 }
96 
97 GPUDispatcher&
99 {
100  return _dispatcher;
101 }
102 
103 Addr
104 Shader::mmap(int length)
105 {
106 
107  Addr start;
108 
109  // round up length to the next page
110  length = roundUp(length, X86ISA::PageBytes);
111 
112  Process *proc = gpuTc->getProcessPtr();
113  auto mem_state = proc->memState;
114 
115  if (proc->mmapGrowsDown()) {
116  DPRINTF(GPUShader, "GROWS DOWN");
117  start = mem_state->getMmapEnd() - length;
118  mem_state->setMmapEnd(start);
119  } else {
120  DPRINTF(GPUShader, "GROWS UP");
121  start = mem_state->getMmapEnd();
122  mem_state->setMmapEnd(start + length);
123 
124  // assertion to make sure we don't overwrite the stack (it grows down)
125  assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
126  mem_state->getMmapEnd());
127  }
128 
129  DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
130 
131  proc->allocateMem(start, length);
132 
133  return start;
134 }
135 
136 void
138 {
139  // grab the threadContext of the thread running on the CPU
140  assert(cpuPointer);
141  gpuTc = cpuPointer->getContext(0);
142  assert(gpuTc);
143 }
144 
146 {
147  for (int j = 0; j < n_cu; ++j)
148  delete cuList[j];
149 }
150 
151 void
153  // context of the thread which dispatched work
154  assert(cpuPointer);
155  gpuTc = cpuPointer->getContext(cid);
156  assert(gpuTc);
157 }
158 
159 void
161 {
162  assert(!sa_when.empty());
163 
164  // apply any scheduled adds
165  for (int i = 0; i < sa_n; ++i) {
166  if (sa_when[i] <= curTick()) {
167  *sa_val[i] += sa_x[i];
168  panic_if(*sa_val[i] < 0, "Negative counter value\n");
169  sa_val.erase(sa_val.begin() + i);
170  sa_x.erase(sa_x.begin() + i);
171  sa_when.erase(sa_when.begin() + i);
172  --sa_n;
173  --i;
174  }
175  }
176  if (!sa_when.empty()) {
177  Tick shader_wakeup = *std::max_element(sa_when.begin(),
178  sa_when.end());
179  DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
180  schedule(tickEvent, shader_wakeup);
181  } else {
182  DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
183  }
184 }
185 
186 /*
187  * dispatcher/shader arranges invalidate requests to the CUs
188  */
189 void
191  // if invalidate has already started/finished, then do nothing
192  if (task->isInvStarted()) return;
193 
194  // invalidate has never started; it can only perform once at kernel launch
195  assert(task->outstandingInvs() == -1);
196  int kernId = task->dispatchId();
197  // counter value is 0 now, indicating the inv is about to start
198  _dispatcher.updateInvCounter(kernId, +1);
199 
200  // iterate all cus managed by the shader, to perform invalidate.
201  for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
202  // create a request to hold INV info; the request's fields will
203  // be updated in cu before use
204  auto req = std::make_shared<Request>(0, 0, 0,
205  cuList[i_cu]->requestorId(),
206  0, -1);
207 
208  _dispatcher.updateInvCounter(kernId, +1);
209  // all necessary INV flags are all set now, call cu to execute
210  cuList[i_cu]->doInvalidate(req, task->dispatchId());
211 
212  // I don't like this. This is intrusive coding.
213  cuList[i_cu]->resetRegisterPool();
214  }
215 }
216 
220 void
222  int kernId = gpuDynInst->kern_id;
223  // flush has never been started, performed only once at kernel end
224  assert(_dispatcher.getOutstandingWbs(kernId) == 0);
225 
226  // the first cu, managed by the shader, performs flush operation,
227  // assuming that L2 cache is shared by all cus in the shader
228  int i_cu = 0;
229  _dispatcher.updateWbCounter(kernId, +1);
230  cuList[i_cu]->doFlush(gpuDynInst);
231 }
232 
233 bool
235 {
236  bool scheduledSomething = false;
237  int cuCount = 0;
238  int curCu = nextSchedCu;
239  int disp_count(0);
240 
241  while (cuCount < n_cu) {
242  //Every time we try a CU, update nextSchedCu
243  nextSchedCu = (nextSchedCu + 1) % n_cu;
244 
245  // dispatch workgroup iff the following two conditions are met:
246  // (a) wg_rem is true - there are unassigned workgroups in the grid
247  // (b) there are enough free slots in cu cuList[i] for this wg
248  int num_wfs_in_wg = 0;
249  bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
250  if (!task->dispComplete() && can_disp) {
251  scheduledSomething = true;
252  DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
253  curCu, task->globalWgId());
254  DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
255  curCu, task->globalWgId());
256  DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
257  curTick(), task->globalWgId(), curCu);
258 
259  if (!cuList[curCu]->tickEvent.scheduled()) {
260  if (!_activeCus)
262  _activeCus++;
263  }
264 
265  panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
266  "Invalid activeCu size\n");
267  cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
268 
269  task->markWgDispatch();
270  ++disp_count;
271  }
272 
273  ++cuCount;
274  curCu = nextSchedCu;
275  }
276 
277  DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);
278 
279  return scheduledSomething;
280 }
281 
282 void
284  bool suppress_func_errors, int cu_id)
285 {
286  int block_size = cuList.at(cu_id)->cacheLineSize();
287  unsigned size = req->getSize();
288 
289  Addr tmp_addr;
290  BaseMMU::Mode trans_mode;
291 
292  if (cmd == MemCmd::ReadReq) {
293  trans_mode = BaseMMU::Read;
294  } else if (cmd == MemCmd::WriteReq) {
295  trans_mode = BaseMMU::Write;
296  } else {
297  fatal("unexcepted MemCmd\n");
298  }
299 
300  tmp_addr = req->getVaddr();
301  Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
302 
303  assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
304 
305  // Misaligned access
306  if (split_addr > tmp_addr) {
307  RequestPtr req1, req2;
308  req->splitOnVaddr(split_addr, req1, req2);
309 
310  PacketPtr pkt1 = new Packet(req2, cmd);
311  PacketPtr pkt2 = new Packet(req1, cmd);
312 
313  functionalTLBAccess(pkt1, cu_id, trans_mode);
314  functionalTLBAccess(pkt2, cu_id, trans_mode);
315 
316  PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
317  PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
318 
319  new_pkt1->dataStatic(data);
320  new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
321 
322  if (suppress_func_errors) {
323  new_pkt1->setSuppressFuncError();
324  new_pkt2->setSuppressFuncError();
325  }
326 
327  // fixme: this should be cuList[cu_id] if cu_id != n_cu
328  // The latter requires a memPort in the dispatcher
329  cuList[0]->memPort[0].sendFunctional(new_pkt1);
330  cuList[0]->memPort[0].sendFunctional(new_pkt2);
331 
332  delete new_pkt1;
333  delete new_pkt2;
334  delete pkt1;
335  delete pkt2;
336  } else {
337  PacketPtr pkt = new Packet(req, cmd);
338  functionalTLBAccess(pkt, cu_id, trans_mode);
339  PacketPtr new_pkt = new Packet(pkt->req, cmd);
340  new_pkt->dataStatic(data);
341 
342  if (suppress_func_errors) {
343  new_pkt->setSuppressFuncError();
344  };
345 
346  // fixme: this should be cuList[cu_id] if cu_id != n_cu
347  // The latter requires a memPort in the dispatcher
348  cuList[0]->memPort[0].sendFunctional(new_pkt);
349 
350  delete new_pkt;
351  delete pkt;
352  }
353 }
354 
355 void
357 {
358  sa_val.push_back(val);
359  when += curTick();
360  sa_when.push_back(when);
361  sa_x.push_back(x);
362  ++sa_n;
363  if (!tickEvent.scheduled() || (when < tickEvent.when())) {
364  DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
365  "%lu\n", when);
366  reschedule(tickEvent, when, true);
367  } else {
368  assert(tickEvent.scheduled());
369  DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
370  "%lu\n", when);
371  }
372 }
373 
374 void
375 Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
376  MemCmd cmd, bool suppress_func_errors)
377 {
378  uint8_t *data_buf = (uint8_t*)ptr;
379 
380  for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
381  !gen.done(); gen.next()) {
382 
383  RequestPtr req = std::make_shared<Request>(
384  gen.addr(), gen.size(), 0,
385  cuList[0]->requestorId(), 0, 0, nullptr);
386 
387  doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
388  data_buf += gen.size();
389  }
390 }
391 
392 void
393 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
394 {
395  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
396 }
397 
398 void
399 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
400  bool suppress_func_errors)
401 {
402  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
403  suppress_func_errors);
404 }
405 
406 void
407 Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
408 {
409  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
410 }
411 
412 void
413 Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
414  bool suppress_func_errors)
415 {
416  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
417  suppress_func_errors);
418 }
419 
420 /*
421  * Send a packet through the appropriate TLB functional port.
422  * If cu_id=n_cu, then this is the dispatcher's TLB.
423  * Otherwise it's the TLB of the cu_id compute unit.
424  */
425 void
427 {
428  // update senderState. Need to know the gpuTc and the TLB mode
429  pkt->senderState =
430  new GpuTranslationState(mode, gpuTc, false);
431 
432  // even when the perLaneTLB flag is turned on
433  // it's ok to send all accesses through lane 0
434  // since the lane # is not known here,
435  // This isn't important since these are functional accesses.
436  cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
437 
438  /* safe_cast the senderState */
439  GpuTranslationState *sender_state =
440  safe_cast<GpuTranslationState*>(pkt->senderState);
441 
442  delete sender_state->tlbEntry;
443  delete pkt->senderState;
444 }
445 
446 /*
447  * allow the shader to sample stats from constituent devices
448  */
449 void
450 Shader::sampleStore(const Tick accessTime)
451 {
452  stats.storeLatencyDist.sample(accessTime);
453  stats.allLatencyDist.sample(accessTime);
454 }
455 
456 /*
457  * allow the shader to sample stats from constituent devices
458  */
459 void
460 Shader::sampleLoad(const Tick accessTime)
461 {
462  stats.loadLatencyDist.sample(accessTime);
463  stats.allLatencyDist.sample(accessTime);
464 }
465 
466 void
468 {
469  // Only sample instructions that go all the way to main memory
470  if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
471  return;
472  }
473 
474  Tick t1 = roundTripTime[0];
475  Tick t2 = roundTripTime[1];
476  Tick t3 = roundTripTime[2];
477  Tick t4 = roundTripTime[3];
478  Tick t5 = roundTripTime[4];
479 
484 }
485 
486 void
488 {
489  stats.coalsrLineAddresses.sample(lineMap.size());
490  std::vector<Tick> netTimes;
491 
492  // For each cache block address generated by a vmem inst, calculate
493  // the round-trip time for that cache block.
494  for (auto& it : lineMap) {
495  const std::vector<Tick>& timeVec = it.second;
496  if (timeVec.size() == 2) {
497  netTimes.push_back(timeVec[1] - timeVec[0]);
498  }
499  }
500 
501  // Sort the cache block round trip times so that the first
502  // distribution is always measuring the fastest and the last
503  // distribution is always measuring the slowest cache block.
504  std::sort(netTimes.begin(), netTimes.end());
505 
506  // Sample the round trip time for each N cache blocks into the
507  // Nth distribution.
508  int idx = 0;
509  for (auto& time : netTimes) {
510  stats.cacheBlockRoundTrip[idx].sample(time);
511  ++idx;
512  }
513 }
514 
515 void
517  // If all CUs attached to this shader are asleep, update shaderActiveTicks
518  panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
519  "Invalid activeCu size\n");
520  _activeCus--;
521  if (!_activeCus)
523 }
524 
526  : statistics::Group(parent),
527  ADD_STAT(allLatencyDist, "delay distribution for all"),
528  ADD_STAT(loadLatencyDist, "delay distribution for loads"),
529  ADD_STAT(storeLatencyDist, "delay distribution for stores"),
530  ADD_STAT(initToCoalesceLatency,
531  "Ticks from vmem inst initiateAcc to coalescer issue"),
532  ADD_STAT(rubyNetworkLatency,
533  "Ticks from coalescer issue to coalescer hit callback"),
534  ADD_STAT(gmEnqueueLatency,
535  "Ticks from coalescer hit callback to GM pipe enqueue"),
536  ADD_STAT(gmToCompleteLatency,
537  "Ticks queued in GM pipes ordered response buffer"),
538  ADD_STAT(coalsrLineAddresses,
539  "Number of cache lines for coalesced request"),
540  ADD_STAT(shaderActiveTicks,
541  "Total ticks that any CU attached to this shader is active"),
542  ADD_STAT(vectorInstSrcOperand,
543  "vector instruction source operand distribution"),
544  ADD_STAT(vectorInstDstOperand,
545  "vector instruction destination operand distribution")
546 {
548  .init(0, 1600000, 10000)
550 
552  .init(0, 1600000, 10000)
554 
556  .init(0, 1600000, 10000)
558 
560  .init(0, 1600000, 10000)
562 
564  .init(0, 1600000, 10000)
566 
568  .init(0, 1600000, 10000)
570 
572  .init(0, 1600000, 10000)
574 
576  .init(0, 20, 1)
578 
581 
583  for (int idx = 0; idx < wf_size; ++idx) {
584  std::stringstream namestr;
585  ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
586  static_cast<Shader*>(parent)->name(), idx);
588  .init(0, 1600000, 10000)
589  .name(namestr.str())
590  .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
592  }
593 }
594 
595 } // namespace gem5
gem5::curTick
Tick curTick()
The universal simulation clock.
Definition: cur_tick.hh:46
fatal
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:190
gem5::Shader::prepareFlush
void prepareFlush(GPUDynInstPtr gpuDynInst)
dispatcher/shader arranges flush requests to the CUs
Definition: shader.cc:221
gem5::Shader::ShaderStats::ShaderStats
ShaderStats(statistics::Group *parent, int wf_size)
Definition: shader.cc:525
gem5::Shader::sa_when
std::vector< uint64_t > sa_when
Definition: shader.hh:217
hsa_queue_entry.hh
gem5::Shader::init
virtual void init()
init() is called after all C++ SimObjects have been created and all ports are connected.
Definition: shader.cc:137
gem5::Event::when
Tick when() const
Get the time that the event is scheduled.
Definition: eventq.hh:508
gem5::BaseMMU::Read
@ Read
Definition: mmu.hh:56
gem5::Shader::~Shader
~Shader()
Definition: shader.cc:145
gem5::Shader::WriteMem
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:407
gem5::Shader::gpuTc
ThreadContext * gpuTc
Definition: shader.hh:108
gem5::Shader::sa_n
uint32_t sa_n
Definition: shader.hh:212
gem5::Shader::sa_x
std::vector< int32_t > sa_x
Definition: shader.hh:219
gem5::ArmISA::t5
Bitfield< 5 > t5
Definition: misc_types.hh:229
gem5::statistics::Distribution
A simple distribution stat.
Definition: statistics.hh:2084
gem5::Shader::sampleLoad
void sampleLoad(const Tick accessTime)
Definition: shader.cc:460
data
const char data[]
Definition: circlebuf.test.cc:48
shader.hh
gem5::Shader::ShaderStats::vectorInstDstOperand
statistics::Vector vectorInstDstOperand
Definition: shader.hh:315
gem5::X86ISA::L
Bitfield< 7, 0 > L
Definition: int.hh:59
gem5::Shader::_dispatcher
GPUDispatcher & _dispatcher
Definition: shader.hh:225
gem5::Shader::mmap
Addr mmap(int length)
Definition: shader.cc:104
gem5::Shader::ShaderStats::gmEnqueueLatency
statistics::Distribution gmEnqueueLatency
Definition: shader.hh:301
gem5::Shader::sa_val
std::vector< int * > sa_val
Definition: shader.hh:215
gem5::BaseMMU::Mode
Mode
Definition: mmu.hh:56
gem5::Packet::req
RequestPtr req
A pointer to the original request.
Definition: packet.hh:366
gem5::BaseMMU::Write
@ Write
Definition: mmu.hh:56
gem5::HSAQueueEntry
Definition: hsa_queue_entry.hh:59
gem5::Shader::updateContext
void updateContext(int cid)
Definition: shader.cc:152
gpu_static_inst.hh
gem5::Shader::cuList
std::vector< ComputeUnit * > cuList
Definition: shader.hh:222
gem5::X86ISA::val
Bitfield< 63 > val
Definition: misc.hh:775
gem5::Shader::functionalTLBAccess
void functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
Definition: shader.cc:426
gem5::Shader::notifyCuSleep
void notifyCuSleep()
Definition: shader.cc:516
gem5::Shader::execScheduledAdds
void execScheduledAdds()
Definition: shader.cc:160
gem5::EventManager::schedule
void schedule(Event &event, Tick when)
Definition: eventq.hh:1019
std::vector< Tick >
gem5::HSAQueueEntry::markWgDispatch
void markWgDispatch()
Definition: hsa_queue_entry.hh:260
gem5::Shader::sampleInstRoundTrip
void sampleInstRoundTrip(std::vector< Tick > roundTripTime)
Definition: shader.cc:467
gem5::GPUDispatcher::getOutstandingWbs
int getOutstandingWbs(int kern_id)
get kernel's outstanding cache writeback requests
Definition: dispatcher.cc:280
gem5::Shader::ShaderStats::vectorInstSrcOperand
statistics::Vector vectorInstSrcOperand
Definition: shader.hh:314
gem5::Shader::Shader
Shader(const Params &p)
Definition: shader.cc:55
gem5::Shader::tickEvent
EventFunctionWrapper tickEvent
Definition: shader.hh:186
gem5::ArmISA::i
Bitfield< 7 > i
Definition: misc_types.hh:67
gem5::Shader::ShaderStats::cacheBlockRoundTrip
statistics::Distribution * cacheBlockRoundTrip
Definition: shader.hh:311
sim_exit.hh
gem5::statistics::DistBase::sample
void sample(const U &v, int n=1)
Add a value to the distribtion n times.
Definition: statistics.hh:1328
gem5::ccprintf
void ccprintf(cp::Print &print)
Definition: cprintf.hh:130
gem5::Shader::ShaderStats::shaderActiveTicks
statistics::Scalar shaderActiveTicks
Definition: shader.hh:313
wavefront.hh
gem5::Process::memState
std::shared_ptr< MemState > memState
Definition: process.hh:290
gem5::Process::allocateMem
void allocateMem(Addr vaddr, int64_t size, bool clobber=false)
Definition: process.cc:317
gem5::Shader::doFunctionalAccess
void doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data, bool suppress_func_errors, int cu_id)
Definition: shader.cc:283
packet.hh
gem5::Shader::_lastInactiveTick
Tick _lastInactiveTick
Definition: shader.hh:94
gem5::ArmISA::t1
Bitfield< 1 > t1
Definition: misc_types.hh:233
gem5::ChunkGenerator
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
Definition: chunk_generator.hh:59
gem5::MemCmd
Definition: packet.hh:75
gem5::HSAQueueEntry::isInvStarted
bool isInvStarted()
Whether invalidate has started or finished -1 is the initial value indicating inv has not started for...
Definition: hsa_queue_entry.hh:324
gem5::statistics::pdf
const FlagsType pdf
Print the percent of the total that this entry represents.
Definition: info.hh:62
gem5::Packet::dataStatic
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:1134
gem5::GPUDispatcher::updateWbCounter
bool updateWbCounter(int kern_id, int val=-1)
update the counter of oustanding wb requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:266
gem5::Shader::ShaderStats::rubyNetworkLatency
statistics::Distribution rubyNetworkLatency
Definition: shader.hh:298
gem5::ArmISA::j
Bitfield< 24 > j
Definition: misc_types.hh:57
gem5::X86ISA::GpuTLB::TranslationState
TLB TranslationState: this currently is a somewhat bastardization of the usage of SenderState,...
Definition: tlb.hh:283
gem5::Shader::ReadMem
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:393
gem5::statistics::DataWrap::name
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Definition: statistics.hh:289
gem5::ChunkGenerator::done
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
Definition: chunk_generator.hh:141
gem5::statistics::Distribution::init
Distribution & init(Counter min, Counter max, Counter bkt)
Set the parameters of this distribution.
Definition: statistics.hh:2113
gem5::Named::name
virtual std::string name() const
Definition: named.hh:47
gem5::ArmISA::t4
Bitfield< 4 > t4
Definition: misc_types.hh:230
DPRINTF
#define DPRINTF(x,...)
Definition: trace.hh:186
ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition: group.hh:75
gem5::Packet
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:283
gem5::probing::Packet
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:109
gem5::MipsISA::p
Bitfield< 0 > p
Definition: pra_constants.hh:326
gem5::Tick
uint64_t Tick
Tick count type.
Definition: types.hh:58
gem5::Shader::stats
gem5::Shader::ShaderStats stats
gem5::RequestPtr
std::shared_ptr< Request > RequestPtr
Definition: request.hh:92
gem5::MemCmd::ReadReq
@ ReadReq
Definition: packet.hh:86
gem5::HSAQueueEntry::outstandingInvs
int outstandingInvs()
Definition: hsa_queue_entry.hh:314
gem5::Shader::cpuPointer
BaseCPU * cpuPointer
Definition: shader.hh:109
gem5::EventManager::reschedule
void reschedule(Event &event, Tick when, bool always=false)
Definition: eventq.hh:1037
gem5::Shader::ShaderStats::allLatencyDist
statistics::Distribution allLatencyDist
Definition: shader.hh:290
gpu_command_processor.hh
gem5::Shader::dispatchWorkgroups
bool dispatchWorkgroups(HSAQueueEntry *task)
Definition: shader.cc:234
gem5::roundDown
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
Definition: intmath.hh:279
gem5::HSAQueueEntry::dispComplete
bool dispComplete() const
Definition: hsa_queue_entry.hh:203
gem5::HSAQueueEntry::globalWgId
int globalWgId() const
Definition: hsa_queue_entry.hh:223
gem5::Shader::ShaderStats::initToCoalesceLatency
statistics::Distribution initToCoalesceLatency
Definition: shader.hh:295
RubySystem.hh
gem5::Shader::prepareInvalidate
void prepareInvalidate(HSAQueueEntry *task)
Definition: shader.cc:190
gem5::Shader::ShaderStats::storeLatencyDist
statistics::Distribution storeLatencyDist
Definition: shader.hh:292
gem5::InstMemoryHopMax
@ InstMemoryHopMax
Definition: misc.hh:58
gem5::Addr
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:147
tlb.hh
gem5::Packet::senderState
SenderState * senderState
This packet's sender state.
Definition: packet.hh:534
gem5::GpuTranslationState
X86ISA::GpuTLB::TranslationState GpuTranslationState
Definition: tlb.hh:439
gem5::ArmISA::t2
Bitfield< 2 > t2
Definition: misc_types.hh:232
gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition: misc.hh:49
gem5::ClockedObject
The ClockedObject class extends the SimObject with a clock and accessor functions to relate ticks to ...
Definition: clocked_object.hh:234
gem5::ArmISA::t3
Bitfield< 3 > t3
Definition: misc_types.hh:231
gem5::Process
Definition: process.hh:68
gem5::ThreadContext::getProcessPtr
virtual Process * getProcessPtr()=0
gem5::Shader::AccessMem
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition: shader.cc:375
panic_if
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Definition: logging.hh:204
gem5::RiscvISA::x
Bitfield< 3 > x
Definition: pagetable.hh:73
gem5::X86ISA::GpuTLB::TranslationState::tlbEntry
TlbEntry * tlbEntry
Definition: tlb.hh:295
gem5::statistics::DataWrap::desc
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Definition: statistics.hh:334
gem5::Shader::ScheduleAdd
void ScheduleAdd(int *val, Tick when, int x)
Definition: shader.cc:356
gem5::Shader::dispatcher
GPUDispatcher & dispatcher()
Definition: shader.cc:98
gem5::statistics::oneline
const FlagsType oneline
Print all values on a single line.
Definition: info.hh:72
gem5::GPUDispatcher::updateInvCounter
void updateInvCounter(int kern_id, int val=-1)
update the counter of oustanding inv requests for the kernel kern_id: kernel id val: +1/-1,...
Definition: dispatcher.cc:246
gem5::Shader::sampleLineRoundTrip
void sampleLineRoundTrip(const std::map< Addr, std::vector< Tick >> &roundTripTime)
Definition: shader.cc:487
gem5::MemCmd::WriteReq
@ WriteReq
Definition: packet.hh:89
gem5::roundUp
static constexpr T roundUp(const T &val, const U &align)
This function is used to align addresses in memory.
Definition: intmath.hh:260
gem5::statistics::Group
Statistics container.
Definition: group.hh:93
gem5::Process::mmapGrowsDown
virtual bool mmapGrowsDown() const
Does mmap region grow upward or downward from mmapEnd? Most platforms grow downward,...
Definition: process.hh:147
chunk_generator.hh
gem5::Shader::Params
ShaderParams Params
Definition: shader.hh:97
gem5::Shader::_activeCus
int _activeCus
Definition: shader.hh:91
gem5::Shader::ShaderStats::loadLatencyDist
statistics::Distribution loadLatencyDist
Definition: shader.hh:291
gem5::Shader::n_cu
int n_cu
Definition: shader.hh:201
dispatcher.hh
gem5::statistics::DataWrap::flags
Derived & flags(Flags _flags)
Set the flags and marks this stat to print at the end of simulation.
Definition: statistics.hh:358
gem5::EventBase::CPU_Tick_Pri
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition: eventq.hh:204
gem5::X86ISA::PageBytes
const Addr PageBytes
Definition: page_size.hh:49
gem5
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
Definition: tlb.cc:60
gem5::statistics::VectorBase::init
Derived & init(size_type size)
Set this vector to have the given size.
Definition: statistics.hh:1040
gem5::HSAQueueEntry::dispatchId
int dispatchId() const
Definition: hsa_queue_entry.hh:153
gem5::Shader::sampleStore
void sampleStore(const Tick accessTime)
Definition: shader.cc:450
gem5::Packet::setSuppressFuncError
void setSuppressFuncError()
Definition: packet.hh:744
gem5::Event::scheduled
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:465
gem5::Shader::ShaderStats::gmToCompleteLatency
statistics::Distribution gmToCompleteLatency
Definition: shader.hh:304
gem5::ArmISA::mode
Bitfield< 4, 0 > mode
Definition: misc_types.hh:74
gem5::Shader
Definition: shader.hh:82
gem5::Shader::nextSchedCu
int nextSchedCu
Definition: shader.hh:209
gem5::Shader::ShaderStats::coalsrLineAddresses
statistics::Distribution coalsrLineAddresses
Definition: shader.hh:307

Generated on Wed May 4 2022 12:13:58 for gem5 by doxygen 1.8.17