gem5  v20.1.0.0
shader.cc
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/isa_traits.hh"
#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

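// Annotation (not in upstream shader.cc): the Shader is the top-level
// SimObject of gem5's gpu-compute model. Judging from this file, it owns
// the ComputeUnits (cuList), wires them to the dispatcher and the GPU
// command processor, dispatches workgroups, and provides functional
// (zero-time) memory and TLB access paths for simulator-side reads and
// writes of the dispatching process's address space.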
Shader::Shader(const Params *p) : ClockedObject(p),
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p->cpu_pointer),
    tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
              false, Event::CPU_Tick_Pri),
    timingSim(p->timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p->impl_kern_launch_acq),
    impl_kern_end_rel(p->impl_kern_end_rel),
    coissue_return(1),
    trace_vgpr_all(1), n_cu((p->CUs).size()), n_wf(p->n_wf),
    globalMemSize(p->globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p->gpu_cmd_proc),
    _dispatcher(*p->dispatcher),
    max_valu_insts(p->max_valu_insts), total_valu_insts(0)
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);

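    // Annotation (not in upstream shader.cc): the three aperture entries
    // below describe the GPUVM, LDS, and scratch windows of the unified
    // 64-bit VA space. All bases set bit 61; each limit keeps the base's
    // upper bits and fills in the low bits, so GPUVM spans 2^40 B (1 TiB)
    // and LDS/scratch span 2^32 B (4 GiB) each.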
    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = ((Addr)1 << 61) + 0x0;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    shHiddenPrivateBaseVmid = 0;

    cuList.resize(n_cu);

    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p->CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p->idlecu_timeout;
    }
}

GPUDispatcher&
Shader::dispatcher()
{
    return _dispatcher;
}

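// Annotation (not in upstream shader.cc): mmap() carves a GPU-visible
// region out of the dispatching process's mmap area and backs it with
// Process::allocateMem(). A hypothetical caller (illustrative only):
//
//     Addr dev_buf = shader->mmap(TheISA::PageBytes);  // one page
//
// The returned address is a virtual address of the CPU process whose
// thread context (gpuTc) this shader holds.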
Addr
Shader::mmap(int length)
{

    Addr start;

    // round up length to the next page
    length = roundUp(length, TheISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid) {
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

Shader*
ShaderParams::create()
{
    return new Shader(this);
}

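// Annotation (not in upstream shader.cc): execScheduledAdds() is the
// consumer half of the "scheduled add" mechanism. ScheduleAdd() (further
// below) records (counter pointer, deadline, delta) triples in the
// sa_val/sa_when/sa_x vectors; once a deadline passes, the loop below
// applies *sa_val[i] += sa_x[i] and drops the entry. This lets other
// models bump shared counters after a delay without owning their own
// events, e.g. (hypothetical, names illustrative):
//
//     shader->ScheduleAdd(&outstanding_reqs, delay_ticks, -1);
//
// The wakeup is rescheduled at the *latest* still-pending deadline;
// ScheduleAdd() pulls the event earlier whenever a sooner deadline
// arrives.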
void
Shader::execScheduledAdds()
{
    assert(!sa_when.empty());

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= curTick()) {
            *sa_val[i] += sa_x[i];
            panic_if(*sa_val[i] < 0, "Negative counter value\n");
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }
    if (!sa_when.empty()) {
        Tick shader_wakeup = *std::max_element(sa_when.begin(),
                                               sa_when.end());
        DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
        schedule(tickEvent, shader_wakeup);
    } else {
        DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    }
}

/*
 * dispatcher/shader arranges invalidate requests to the CUs
 */
void
Shader::prepareInvalidate(HSAQueueEntry *task) {
    // if invalidate has already started/finished, then do nothing
    if (task->isInvStarted()) return;

    // invalidate has never started; it can only perform once at kernel launch
    assert(task->outstandingInvs() == -1);
    int kernId = task->dispatchId();
    // counter value is 0 now, indicating the inv is about to start
    _dispatcher.updateInvCounter(kernId, +1);

    // iterate all cus managed by the shader, to perform invalidate.
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // create a request to hold INV info; the request's fields will
        // be updated in cu before use
        auto req = std::make_shared<Request>(0, 0, 0,
                                             cuList[i_cu]->requestorId(),
                                             0, -1);

        _dispatcher.updateInvCounter(kernId, +1);
        // all necessary INV flags are all set now, call cu to execute
        cuList[i_cu]->doInvalidate(req, task->dispatchId());
    }
}

/*
 * dispatcher/shader arranges flush requests to the CUs
 */
void
Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
    int kernId = gpuDynInst->kern_id;
    // flush has never been started, performed only once at kernel end
    assert(_dispatcher.getOutstandingWbs(kernId) == 0);

    // the first cu, managed by the shader, performs flush operation,
    // assuming that L2 cache is shared by all cus in the shader
    int i_cu = 0;
    _dispatcher.updateWbCounter(kernId, +1);
    cuList[i_cu]->doFlush(gpuDynInst);
}

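// Annotation (not in upstream shader.cc): dispatchWorkgroups() walks the
// CUs round-robin starting at nextSchedCu, placing workgroups from 'task'
// on any CU that still has dispatch resources. Waking a CU whose tick
// event is idle is also where the active-CU count backing the
// shaderActiveTicks statistic is maintained. Returns true if at least one
// workgroup was dispatched.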
bool
Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;

    while (cuCount < n_cu) {
        //Every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch workgroup iff the following two conditions are met:
        // (a) wg_rem is true - there are unassigned workgroups in the grid
        // (b) there are enough free slots in cu cuList[i] for this wg
        int num_wfs_in_wg = 0;
        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
        if (!task->dispComplete() && can_disp) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    curTick(), task->globalWgId(), curCu);

            if (!cuList[curCu]->tickEvent.scheduled()) {
                if (!_activeCus)
                    _lastInactiveTick = curTick();
                _activeCus++;
            }

            panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
                     "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);

            task->markWgDispatch();
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    return scheduledSomething;
}

void
Shader::regStats()
{
    ClockedObject::regStats();

    shaderActiveTicks
        .name(name() + ".shader_active_ticks")
        .desc("Total ticks that any CU attached to this shader is active")
        ;
    allLatencyDist
        .init(0, 1600000, 10000)
        .name(name() + ".allLatencyDist")
        .desc("delay distribution for all")
        .flags(Stats::pdf | Stats::oneline);

    loadLatencyDist
        .init(0, 1600000, 10000)
        .name(name() + ".loadLatencyDist")
        .desc("delay distribution for loads")
        .flags(Stats::pdf | Stats::oneline);

    storeLatencyDist
        .init(0, 1600000, 10000)
        .name(name() + ".storeLatencyDist")
        .desc("delay distribution for stores")
        .flags(Stats::pdf | Stats::oneline);

    vectorInstSrcOperand
        .init(4)
        .name(name() + ".vec_inst_src_operand")
        .desc("vector instruction source operand distribution");

    vectorInstDstOperand
        .init(4)
        .name(name() + ".vec_inst_dst_operand")
        .desc("vector instruction destination operand distribution");

    initToCoalesceLatency
        .init(0, 1600000, 10000)
        .name(name() + ".initToCoalesceLatency")
        .desc("Ticks from vmem inst initiateAcc to coalescer issue")
        .flags(Stats::pdf | Stats::oneline);

    rubyNetworkLatency
        .init(0, 1600000, 10000)
        .name(name() + ".rubyNetworkLatency")
        .desc("Ticks from coalescer issue to coalescer hit callback")
        .flags(Stats::pdf | Stats::oneline);

    gmEnqueueLatency
        .init(0, 1600000, 10000)
        .name(name() + ".gmEnqueueLatency")
        .desc("Ticks from coalescer hit callback to GM pipe enqueue")
        .flags(Stats::pdf | Stats::oneline);

    gmToCompleteLatency
        .init(0, 1600000, 10000)
        .name(name() + ".gmToCompleteLatency")
        .desc("Ticks queued in GM pipes ordered response buffer")
        .flags(Stats::pdf | Stats::oneline);

    coalsrLineAddresses
        .init(0, 20, 1)
        .name(name() + ".coalsrLineAddresses")
        .desc("Number of cache lines for coalesced request")
        .flags(Stats::pdf | Stats::oneline);

    int wfSize = cuList[0]->wfSize();
    cacheBlockRoundTrip = new Stats::Distribution[wfSize];
    for (int idx = 0; idx < wfSize; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d", name(), idx);
        cacheBlockRoundTrip[idx]
            .init(0, 1600000, 10000)
            .name(namestr.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
            .flags(Stats::pdf | Stats::oneline);
    }
}

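// Annotation (not in upstream shader.cc): doFunctionalAccess() performs an
// untimed access on behalf of the simulator: the virtual address is
// translated through a CU's functional TLB port, then the data packet is
// sent on a CU memory port with sendFunctional(). Accesses that straddle a
// cache-line boundary (split_addr > tmp_addr) are split in two.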
void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // Misaligned access
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req2, cmd);
        PacketPtr pkt2 = new Packet(req1, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // fixme: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0].sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}

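// Annotation (not in upstream shader.cc): 'when' is a relative delay; it
// becomes an absolute deadline below by adding curTick(). The shared tick
// event is pulled earlier if this deadline precedes the current wakeup.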
void
Shader::ScheduleAdd(int *val, Tick when, int x)
{
    sa_val.push_back(val);
    when += curTick();
    sa_when.push_back(when);
    sa_x.push_back(x);
    ++sa_n;
    if (!tickEvent.scheduled() || (when < tickEvent.when())) {
        DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
                "%lu\n", when);
        reschedule(tickEvent, when, true);
    } else {
        assert(tickEvent.scheduled());
        DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
                "%lu\n", when);
    }
}

void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {

        RequestPtr req = std::make_shared<Request>(
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}
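
// Annotation (not in upstream shader.cc): a hypothetical use of the
// functional helpers above, moving data between a simulator buffer and
// the simulated address space (names illustrative only):
//
//     uint64_t word;
//     shader->ReadMem(vaddr, &word, sizeof(word), 0);   // via CU 0's ports
//     word |= 0x1;
//     shader->WriteMem(vaddr, &word, sizeof(word), 0);
//
// The suppress_func_errors overloads mark the packets so functional-access
// failures are suppressed rather than fatal.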

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id=n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState. Need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    // even when the perLaneTLB flag is turned on
    // it's ok to send all accesses through lane 0
    // since the lane # is not known here,
    // This isn't important since these are functional accesses.
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleStore(const Tick accessTime)
{
    storeLatencyDist.sample(accessTime);
    allLatencyDist.sample(accessTime);
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleLoad(const Tick accessTime)
{
    loadLatencyDist.sample(accessTime);
    allLatencyDist.sample(accessTime);
}

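// Annotation (not in upstream shader.cc): roundTripTime carries one
// timestamp per InstMemoryHop of a vector memory instruction (initiateAcc,
// coalescer issue, coalescer hit callback, GM pipe enqueue, completion,
// per the stat descriptions in regStats()); the adjacent differences
// t2-t1 .. t5-t4 feed the four latency distributions.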
void
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
{
    // Only sample instructions that go all the way to main memory
    if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
        return;
    }

    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];

    initToCoalesceLatency.sample(t2-t1);
    rubyNetworkLatency.sample(t3-t2);
    gmEnqueueLatency.sample(t4-t3);
    gmToCompleteLatency.sample(t5-t4);
}

void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
{
    coalsrLineAddresses.sample(lineMap.size());
    std::vector<Tick> netTimes;

    // For each cache block address generated by a vmem inst, calculate
    // the round-trip time for that cache block.
    for (auto& it : lineMap) {
        const std::vector<Tick>& timeVec = it.second;
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        }
    }

    // Sort the cache block round trip times so that the first
    // distribution is always measuring the fastest and the last
    // distribution is always measuring the slowest cache block.
    std::sort(netTimes.begin(), netTimes.end());

    // Sample the round trip time for each N cache blocks into the
    // Nth distribution.
    int idx = 0;
    for (auto& time : netTimes) {
        cacheBlockRoundTrip[idx].sample(time);
        ++idx;
    }
}

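// Annotation (not in upstream shader.cc): presumably invoked by a CU as it
// goes idle. _lastInactiveTick records when the shader last went from
// fully idle to active (see dispatchWorkgroups()), so when the final
// active CU sleeps, that whole active interval is credited to
// shaderActiveTicks.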
void
Shader::notifyCuSleep() {
    // If all CUs attached to this shader are asleep, update shaderActiveTicks
    panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
             "Invalid activeCu size\n");
    _activeCus--;
    if (!_activeCus)
        shaderActiveTicks += curTick() - _lastInactiveTick;
}