shader.cc
1 /*
2  * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "gpu-compute/shader.hh"
33 
34 #include <limits>
35 
36 #include "arch/amdgpu/common/gpu_translation_state.hh"
37 #include "arch/amdgpu/common/tlb.hh"
38 #include "base/chunk_generator.hh"
39 #include "debug/GPUAgentDisp.hh"
40 #include "debug/GPUDisp.hh"
41 #include "debug/GPUMem.hh"
42 #include "debug/GPUShader.hh"
43 #include "debug/GPUWgLatency.hh"
44 #include "gpu-compute/dispatcher.hh"
45 #include "gpu-compute/gpu_command_processor.hh"
46 #include "gpu-compute/gpu_static_inst.hh"
47 #include "gpu-compute/hsa_queue_entry.hh"
48 #include "gpu-compute/wavefront.hh"
49 #include "mem/packet.hh"
50 #include "mem/ruby/system/RubySystem.hh"
51 #include "sim/sim_exit.hh"
52 
53 namespace gem5
54 {
55 
56 Shader::Shader(const Params &p) : ClockedObject(p),
57  _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
58  gpuTc(nullptr), cpuPointer(p.cpu_pointer),
59  tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
60  false, Event::CPU_Tick_Pri),
61  timingSim(p.timing), hsail_mode(SIMT),
62  impl_kern_launch_acq(p.impl_kern_launch_acq),
63  impl_kern_end_rel(p.impl_kern_end_rel),
64  coissue_return(1),
65  trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
66  globalMemSize(p.globalmem),
67  nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
68  _dispatcher(*p.dispatcher), systemHub(p.system_hub),
69  max_valu_insts(p.max_valu_insts), total_valu_insts(0),
70  stats(this, p.CUs[0]->wfSize())
71 {
72  gpuCmdProc.setShader(this);
73  _dispatcher.setShader(this);
74 
75  _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
76  _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
77 
78  _ldsApe.base = ((Addr)1 << 61) + 0x0;
79  _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
80 
81  _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
82  _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
83 
84  shHiddenPrivateBaseVmid = 0;
85 
86  cuList.resize(n_cu);
87 
88  panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");
89 
90  for (int i = 0; i < n_cu; ++i) {
91  cuList[i] = p.CUs[i];
92  assert(i == cuList[i]->cu_id);
93  cuList[i]->shader = this;
94  cuList[i]->idleCUTimeout = p.idlecu_timeout;
95  }
96 }
97 
98 GPUDispatcher&
99 Shader::dispatcher()
100 {
101  return _dispatcher;
102 }
103 
104 Addr
105 Shader::mmap(int length)
106 {
107 
108  Addr start;
109 
110  // round up length to the next page
111  length = roundUp(length, X86ISA::PageBytes);
112 
113  Process *proc = gpuTc->getProcessPtr();
114  auto mem_state = proc->memState;
115 
116  if (proc->mmapGrowsDown()) {
117  DPRINTF(GPUShader, "GROWS DOWN");
118  start = mem_state->getMmapEnd() - length;
119  mem_state->setMmapEnd(start);
120  } else {
121  DPRINTF(GPUShader, "GROWS UP");
122  start = mem_state->getMmapEnd();
123  mem_state->setMmapEnd(start + length);
124 
125  // assertion to make sure we don't overwrite the stack (it grows down)
126  assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
127  mem_state->getMmapEnd());
128  }
129 
130  DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);
131 
132  proc->allocateMem(start, length);
133 
134  return start;
135 }
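
// Usage sketch (hypothetical caller; 'shader' is assumed to be a Shader*):
// mmap() always reserves whole x86 pages, so a request for 100 bytes is
// rounded up to X86ISA::PageBytes (4096 bytes) before the region is mapped.
//
//     Addr va = shader->mmap(100);   // reserves one full 4 KiB page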
136 
137 void
138 Shader::init()
139 {
140  // grab the threadContext of the thread running on the CPU
141  assert(cpuPointer);
142  gpuTc = cpuPointer->getContext(0);
143  assert(gpuTc);
144 }
145 
146 Shader::~Shader()
147 {
148  for (int j = 0; j < n_cu; ++j)
149  delete cuList[j];
150 }
151 
152 void
153 Shader::updateContext(int cid) {
154  // context of the thread which dispatched work
155  assert(cpuPointer);
156  gpuTc = cpuPointer->getContext(cid);
157  assert(gpuTc);
158 }
159 
160 void
161 Shader::execScheduledAdds()
162 {
163  assert(!sa_when.empty());
164 
165  // apply any scheduled adds
166  for (int i = 0; i < sa_n; ++i) {
167  if (sa_when[i] <= curTick()) {
168  *sa_val[i] += sa_x[i];
169  panic_if(*sa_val[i] < 0, "Negative counter value\n");
170  sa_val.erase(sa_val.begin() + i);
171  sa_x.erase(sa_x.begin() + i);
172  sa_when.erase(sa_when.begin() + i);
173  --sa_n;
174  --i;
175  }
176  }
177  if (!sa_when.empty()) {
178  Tick shader_wakeup = *std::max_element(sa_when.begin(),
179  sa_when.end());
180  DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
181  schedule(tickEvent, shader_wakeup);
182  } else {
183  DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
184  }
185 }
186 
187 /*
188  * dispatcher/shader arranges invalidate requests to the CUs
189  */
190 void
191 Shader::prepareInvalidate(HSAQueueEntry *task) {
192  // if invalidate has already started/finished, then do nothing
193  if (task->isInvStarted()) return;
194 
195  // invalidate has never started; it is performed only once, at kernel launch
196  assert(task->outstandingInvs() == -1);
197  int kernId = task->dispatchId();
198  // counter value is 0 now, indicating the inv is about to start
199  _dispatcher.updateInvCounter(kernId, +1);
200 
201  // iterate all cus managed by the shader, to perform invalidate.
202  for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
203  // create a request to hold INV info; the request's fields will
204  // be updated in cu before use
205  auto req = std::make_shared<Request>(0, 0, 0,
206  cuList[i_cu]->requestorId(),
207  0, -1);
208 
209  _dispatcher.updateInvCounter(kernId, +1);
210  // all necessary INV flags are set now; call the cu to execute
211  cuList[i_cu]->doInvalidate(req, task->dispatchId());
212 
213  // I don't like this. This is intrusive coding.
214  cuList[i_cu]->resetRegisterPool();
215  }
216 }
217 
218 /**
219  * dispatcher/shader arranges flush requests to the CUs
220  */
221 void
222 Shader::prepareFlush(GPUDynInstPtr gpuDynInst) {
223  int kernId = gpuDynInst->kern_id;
224  // flush has never been started, performed only once at kernel end
225  assert(_dispatcher.getOutstandingWbs(kernId) == 0);
226 
227  // the first cu, managed by the shader, performs flush operation,
228  // assuming that L2 cache is shared by all cus in the shader
229  int i_cu = 0;
230  _dispatcher.updateWbCounter(kernId, +1);
231  cuList[i_cu]->doFlush(gpuDynInst);
232 }
233 
234 bool
235 Shader::dispatchWorkgroups(HSAQueueEntry *task)
236 {
237  bool scheduledSomething = false;
238  int cuCount = 0;
239  int curCu = nextSchedCu;
240  int disp_count(0);
241 
242  while (cuCount < n_cu) {
243  //Every time we try a CU, update nextSchedCu
244  nextSchedCu = (nextSchedCu + 1) % n_cu;
245 
246  // dispatch workgroup iff the following two conditions are met:
247  // (a) wg_rem is true - there are unassigned workgroups in the grid
248  // (b) there are enough free slots in cu cuList[i] for this wg
249  int num_wfs_in_wg = 0;
250  bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
251  if (!task->dispComplete() && can_disp) {
252  scheduledSomething = true;
253  DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
254  curCu, task->globalWgId());
255  DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
256  curCu, task->globalWgId());
257  DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
258  curTick(), task->globalWgId(), curCu);
259 
260  if (!cuList[curCu]->tickEvent.scheduled()) {
261  if (!_activeCus)
262  _lastInactiveTick = curTick();
263  _activeCus++;
264  }
265 
266  panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
267  "Invalid activeCu size\n");
268  cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);
269 
270  task->markWgDispatch();
271  ++disp_count;
272  }
273 
274  ++cuCount;
275  curCu = nextSchedCu;
276  }
277 
278  DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);
279 
280  return scheduledSomething;
281 }
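
// Usage sketch (hypothetical; the real caller lives in the GPU dispatcher):
// dispatchWorkgroups() returns true only if at least one workgroup was placed
// on a CU this pass, so a caller can retry later when it returns false.
//
//     if (!shader->dispatchWorkgroups(task)) {
//         // no CU had enough free slots; retry once resources free up
//     }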
282 
283 void
284 Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
285  bool suppress_func_errors, int cu_id)
286 {
287  int block_size = cuList.at(cu_id)->cacheLineSize();
288  unsigned size = req->getSize();
289 
290  Addr tmp_addr;
291  BaseMMU::Mode trans_mode;
292 
293  if (cmd == MemCmd::ReadReq) {
294  trans_mode = BaseMMU::Read;
295  } else if (cmd == MemCmd::WriteReq) {
296  trans_mode = BaseMMU::Write;
297  } else {
298  fatal("unexpected MemCmd\n");
299  }
300 
301  tmp_addr = req->getVaddr();
302  Addr split_addr = roundDown(tmp_addr + size - 1, block_size);
303 
304  assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);
305 
306  // Misaligned access
307  if (split_addr > tmp_addr) {
308  RequestPtr req1, req2;
309  req->splitOnVaddr(split_addr, req1, req2);
310 
311  PacketPtr pkt1 = new Packet(req2, cmd);
312  PacketPtr pkt2 = new Packet(req1, cmd);
313 
314  functionalTLBAccess(pkt1, cu_id, trans_mode);
315  functionalTLBAccess(pkt2, cu_id, trans_mode);
316 
317  PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
318  PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);
319 
320  new_pkt1->dataStatic(data);
321  new_pkt2->dataStatic((uint8_t*)data + req1->getSize());
322 
323  if (suppress_func_errors) {
324  new_pkt1->setSuppressFuncError();
325  new_pkt2->setSuppressFuncError();
326  }
327 
328  // fixme: this should be cuList[cu_id] if cu_id != n_cu
329  // The latter requires a memPort in the dispatcher
330  cuList[0]->memPort[0].sendFunctional(new_pkt1);
331  cuList[0]->memPort[0].sendFunctional(new_pkt2);
332 
333  delete new_pkt1;
334  delete new_pkt2;
335  delete pkt1;
336  delete pkt2;
337  } else {
338  PacketPtr pkt = new Packet(req, cmd);
339  functionalTLBAccess(pkt, cu_id, trans_mode);
340  PacketPtr new_pkt = new Packet(pkt->req, cmd);
341  new_pkt->dataStatic(data);
342 
343  if (suppress_func_errors) {
344  new_pkt->setSuppressFuncError();
345  };
346 
347  // fixme: this should be cuList[cu_id] if cu_id != n_cu
348  // The latter requires a memPort in the dispatcher
349  cuList[0]->memPort[0].sendFunctional(new_pkt);
350 
351  delete new_pkt;
352  delete pkt;
353  }
354 }
355 
356 void
357 Shader::ScheduleAdd(int *val, Tick when, int x)
358 {
359  sa_val.push_back(val);
360  when += curTick();
361  sa_when.push_back(when);
362  sa_x.push_back(x);
363  ++sa_n;
364  if (!tickEvent.scheduled() || (when < tickEvent.when())) {
365  DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
366  "%lu\n", when);
367  reschedule(tickEvent, when, true);
368  } else {
369  assert(tickEvent.scheduled());
370  DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
371  "%lu\n", when);
372  }
373 }
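
// Usage sketch (hypothetical counter and delay, not taken from gem5 call
// sites): ScheduleAdd() queues "*val += x" to be applied 'when' ticks from
// now; execScheduledAdds() performs the update once curTick() reaches the
// scheduled time.
//
//     int outstandingReqs = 1;
//     shader->ScheduleAdd(&outstandingReqs, 500, -1); // decrement in 500 ticks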
374 
375 void
376 Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
377  MemCmd cmd, bool suppress_func_errors)
378 {
379  uint8_t *data_buf = (uint8_t*)ptr;
380 
381  for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
382  !gen.done(); gen.next()) {
383 
384  RequestPtr req = std::make_shared<Request>(
385  gen.addr(), gen.size(), 0,
386  cuList[0]->requestorId(), 0, 0, nullptr);
387 
388  doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
389  data_buf += gen.size();
390  }
391 }
392 
393 void
394 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
395 {
396  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
397 }
398 
399 void
400 Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
401  bool suppress_func_errors)
402 {
403  AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
404  suppress_func_errors);
405 }
406 
407 void
408 Shader::WriteMem(uint64_t address, void *ptr,uint32_t size, int cu_id)
409 {
410  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
411 }
412 
413 void
414 Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
415  bool suppress_func_errors)
416 {
417  AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
418  suppress_func_errors);
419 }
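
// Usage sketch (hypothetical buffer and virtual address): ReadMem()/WriteMem()
// perform functional, cache-line-chunked accesses through CU 0's memory port
// (see the fixme in doFunctionalAccess), e.g. to copy a block of data without
// advancing simulated time.
//
//     uint8_t buf[64];
//     shader->ReadMem(gpuVaddr, buf, sizeof(buf), 0);  // gpuVaddr is assumed
//     shader->WriteMem(gpuVaddr, buf, sizeof(buf), 0);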
420 
421 /*
422  * Send a packet through the appropriate TLB functional port.
423  * If cu_id=n_cu, then this is the dispatcher's TLB.
424  * Otherwise it's the TLB of the cu_id compute unit.
425  */
426 void
427 Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseMMU::Mode mode)
428 {
429  // update senderState. Need to know the gpuTc and the TLB mode
430  pkt->senderState =
431  new GpuTranslationState(mode, gpuTc, false);
432 
433  // Even when the perLaneTLB flag is turned on,
434  // it's ok to send all accesses through lane 0
435  // since the lane # is not known here.
436  // This isn't important since these are functional accesses.
437  cuList[cu_id]->tlbPort[0].sendFunctional(pkt);
438 
439  /* safe_cast the senderState */
440  GpuTranslationState *sender_state =
441  safe_cast<GpuTranslationState*>(pkt->senderState);
442 
443  delete sender_state->tlbEntry;
444  delete pkt->senderState;
445 }
446 
447 /*
448  * allow the shader to sample stats from constituent devices
449  */
450 void
451 Shader::sampleStore(const Tick accessTime)
452 {
453  stats.storeLatencyDist.sample(accessTime);
454  stats.allLatencyDist.sample(accessTime);
455 }
456 
457 /*
458  * allow the shader to sample stats from constituent devices
459  */
460 void
461 Shader::sampleLoad(const Tick accessTime)
462 {
463  stats.loadLatencyDist.sample(accessTime);
464  stats.allLatencyDist.sample(accessTime);
465 }
466 
467 void
468 Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
469 {
470  // Only sample instructions that go all the way to main memory
471  if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
472  return;
473  }
474 
475  Tick t1 = roundTripTime[0];
476  Tick t2 = roundTripTime[1];
477  Tick t3 = roundTripTime[2];
478  Tick t4 = roundTripTime[3];
479  Tick t5 = roundTripTime[4];
480 
481  stats.initToCoalesceLatency.sample(t2-t1);
482  stats.rubyNetworkLatency.sample(t3-t2);
483  stats.gmEnqueueLatency.sample(t4-t3);
484  stats.gmToCompleteLatency.sample(t5-t4);
485 }
486 
487 void
488 Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
489 {
490  stats.coalsrLineAddresses.sample(lineMap.size());
491  std::vector<Tick> netTimes;
492 
493  // For each cache block address generated by a vmem inst, calculate
494  // the round-trip time for that cache block.
495  for (auto& it : lineMap) {
496  const std::vector<Tick>& timeVec = it.second;
497  if (timeVec.size() == 2) {
498  netTimes.push_back(timeVec[1] - timeVec[0]);
499  }
500  }
501 
502  // Sort the cache block round trip times so that the first
503  // distribution always measures the fastest and the last
504  // distribution always measures the slowest cache block.
505  std::sort(netTimes.begin(), netTimes.end());
506 
507  // Sample the round trip time for each N cache blocks into the
508  // Nth distribution.
509  int idx = 0;
510  for (auto& time : netTimes) {
511  stats.cacheBlockRoundTrip[idx].sample(time);
512  ++idx;
513  }
514 }
515 
516 void
517 Shader::notifyCuSleep() {
518  // If all CUs attached to this shader are asleep, update shaderActiveTicks
519  panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
520  "Invalid activeCu size\n");
521  _activeCus--;
522  if (!_activeCus)
523  stats.shaderActiveTicks += curTick() - _lastInactiveTick;
524 }
525 
526 /**
527  * Forward the VRAM requestor ID needed for device memory from CP.
528  */
529 RequestorID
530 Shader::vramRequestorId()
531 {
532  return gpuCmdProc.vramRequestorId();
533 }
534 
535 Shader::ShaderStats::ShaderStats(statistics::Group *parent, int wf_size)
536  : statistics::Group(parent),
537  ADD_STAT(allLatencyDist, "delay distribution for all"),
538  ADD_STAT(loadLatencyDist, "delay distribution for loads"),
539  ADD_STAT(storeLatencyDist, "delay distribution for stores"),
540  ADD_STAT(initToCoalesceLatency,
541  "Ticks from vmem inst initiateAcc to coalescer issue"),
542  ADD_STAT(rubyNetworkLatency,
543  "Ticks from coalescer issue to coalescer hit callback"),
544  ADD_STAT(gmEnqueueLatency,
545  "Ticks from coalescer hit callback to GM pipe enqueue"),
546  ADD_STAT(gmToCompleteLatency,
547  "Ticks queued in GM pipes ordered response buffer"),
548  ADD_STAT(coalsrLineAddresses,
549  "Number of cache lines for coalesced request"),
550  ADD_STAT(shaderActiveTicks,
551  "Total ticks that any CU attached to this shader is active"),
552  ADD_STAT(vectorInstSrcOperand,
553  "vector instruction source operand distribution"),
554  ADD_STAT(vectorInstDstOperand,
555  "vector instruction destination operand distribution")
556 {
557  allLatencyDist
558  .init(0, 1600000, 10000)
559  .flags(statistics::pdf | statistics::oneline);
560 
561  loadLatencyDist
562  .init(0, 1600000, 10000)
563  .flags(statistics::pdf | statistics::oneline);
564 
565  storeLatencyDist
566  .init(0, 1600000, 10000)
567  .flags(statistics::pdf | statistics::oneline);
568 
569  initToCoalesceLatency
570  .init(0, 1600000, 10000)
571  .flags(statistics::pdf | statistics::oneline);
572 
573  rubyNetworkLatency
574  .init(0, 1600000, 10000)
575  .flags(statistics::pdf | statistics::oneline);
576 
577  gmEnqueueLatency
578  .init(0, 1600000, 10000)
579  .flags(statistics::pdf | statistics::oneline);
580 
581  gmToCompleteLatency
582  .init(0, 1600000, 10000)
583  .flags(statistics::pdf | statistics::oneline);
584 
585  coalsrLineAddresses
586  .init(0, 20, 1)
587  .flags(statistics::pdf | statistics::oneline);
588 
589  vectorInstSrcOperand.init(4);
590  vectorInstDstOperand.init(4);
591 
592  cacheBlockRoundTrip = new statistics::Distribution[wf_size];
593  for (int idx = 0; idx < wf_size; ++idx) {
594  std::stringstream namestr;
595  ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
596  static_cast<Shader*>(parent)->name(), idx);
597  cacheBlockRoundTrip[idx]
598  .init(0, 1600000, 10000)
599  .name(namestr.str())
600  .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
601  .flags(statistics::pdf | statistics::oneline);
602  }
603 }
604 
605 } // namespace gem5