gem5 v21.0.0.0
shader.cc
/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/shader.hh"

#include <limits>

#include "arch/x86/isa_traits.hh"
#include "arch/x86/linux/linux.hh"
#include "base/chunk_generator.hh"
#include "debug/GPUAgentDisp.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUMem.hh"
#include "debug/GPUShader.hh"
#include "debug/GPUWgLatency.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/gpu_static_inst.hh"
#include "gpu-compute/hsa_queue_entry.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "sim/sim_exit.hh"

Shader::Shader(const Params &p) : ClockedObject(p),
    _activeCus(0), _lastInactiveTick(0), cpuThread(nullptr),
    gpuTc(nullptr), cpuPointer(p.cpu_pointer),
    tickEvent([this]{ execScheduledAdds(); }, "Shader scheduled adds event",
              false, Event::CPU_Tick_Pri),
    timingSim(p.timing), hsail_mode(SIMT),
    impl_kern_launch_acq(p.impl_kern_launch_acq),
    impl_kern_end_rel(p.impl_kern_end_rel),
    coissue_return(1),
    trace_vgpr_all(1), n_cu((p.CUs).size()), n_wf(p.n_wf),
    globalMemSize(p.globalmem),
    nextSchedCu(0), sa_n(0), gpuCmdProc(*p.gpu_cmd_proc),
    _dispatcher(*p.dispatcher),
    max_valu_insts(p.max_valu_insts), total_valu_insts(0),
    stats(this, p.CUs[0]->wfSize())
{
    gpuCmdProc.setShader(this);
    _dispatcher.setShader(this);

    _gpuVmApe.base = ((Addr)1 << 61) + 0x1000000000000L;
    _gpuVmApe.limit = (_gpuVmApe.base & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;

    _ldsApe.base = ((Addr)1 << 61) + 0x0;
    _ldsApe.limit = (_ldsApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    _scratchApe.base = ((Addr)1 << 61) + 0x100000000L;
    _scratchApe.limit = (_scratchApe.base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;

    shHiddenPrivateBaseVmid = 0;

    cuList.resize(n_cu);

    panic_if(n_wf <= 0, "Must have at least 1 WF Slot per SIMD");

    for (int i = 0; i < n_cu; ++i) {
        cuList[i] = p.CUs[i];
        assert(i == cuList[i]->cu_id);
        cuList[i]->shader = this;
        cuList[i]->idleCUTimeout = p.idlecu_timeout;
    }
}
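
// The three apertures configured above (GPUVM, LDS, scratch) all sit in the
// upper canonical half of the 64-bit VA space (bit 61 set). Each limit is
// derived by masking the base and filling the low-order bits, which yields
// fixed-size, non-overlapping windows: 4 GiB each for LDS and scratch, and a
// larger window for GPUVM starting 2^48 bytes above the same origin.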

GPUDispatcher&
Shader::dispatcher()
{
    return _dispatcher;
}

Addr
Shader::mmap(int length)
{
    Addr start;

    // round up length to the next page
    length = roundUp(length, X86ISA::PageBytes);

    Process *proc = gpuTc->getProcessPtr();
    auto mem_state = proc->memState;

    if (proc->mmapGrowsDown()) {
        DPRINTF(GPUShader, "GROWS DOWN");
        start = mem_state->getMmapEnd() - length;
        mem_state->setMmapEnd(start);
    } else {
        DPRINTF(GPUShader, "GROWS UP");
        start = mem_state->getMmapEnd();
        mem_state->setMmapEnd(start + length);

        // assertion to make sure we don't overwrite the stack (it grows down)
        assert(mem_state->getStackBase() - mem_state->getMaxStackSize() >
               mem_state->getMmapEnd());
    }

    DPRINTF(GPUShader, "Shader::mmap start= %#x, %#x\n", start, length);

    proc->allocateMem(start, length);

    return start;
}
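
// Worked example: with X86ISA::PageBytes == 4096, a 10-byte request is
// rounded up to one full 4 KiB page. On a grows-down platform the mmap end
// pointer moves down by that page and the new end is returned as the start
// of the allocation.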

void
Shader::init()
{
    // grab the threadContext of the thread running on the CPU
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(0);
    assert(gpuTc);
}

Shader::~Shader()
{
    for (int j = 0; j < n_cu; ++j)
        delete cuList[j];
}

void
Shader::updateContext(int cid) {
    // context of the thread which dispatched work
    assert(cpuPointer);
    gpuTc = cpuPointer->getContext(cid);
    assert(gpuTc);
}

void
Shader::execScheduledAdds()
{
    assert(!sa_when.empty());

    // apply any scheduled adds
    for (int i = 0; i < sa_n; ++i) {
        if (sa_when[i] <= curTick()) {
            *sa_val[i] += sa_x[i];
            panic_if(*sa_val[i] < 0, "Negative counter value\n");
            sa_val.erase(sa_val.begin() + i);
            sa_x.erase(sa_x.begin() + i);
            sa_when.erase(sa_when.begin() + i);
            --sa_n;
            --i;
        }
    }
    if (!sa_when.empty()) {
        Tick shader_wakeup = *std::max_element(sa_when.begin(),
                                               sa_when.end());
        DPRINTF(GPUDisp, "Scheduling shader wakeup at %lu\n", shader_wakeup);
        schedule(tickEvent, shader_wakeup);
    } else {
        DPRINTF(GPUDisp, "sa_when empty, shader going to sleep!\n");
    }
}
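
// Note that the next wakeup is scheduled at the *latest* outstanding
// deadline (std::max_element), so any adds with earlier deadlines are
// applied in the same pass when that event finally fires.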

/*
 * dispatcher/shader arranges invalidate requests to the CUs
 */
void
Shader::prepareInvalidate(HSAQueueEntry *task) {
    // if the invalidate has already started/finished, do nothing
    if (task->isInvStarted()) return;

    // the invalidate has never started; it can only be performed once,
    // at kernel launch
    assert(task->outstandingInvs() == -1);
    int kernId = task->dispatchId();
    // counter value is 0 now, indicating the inv is about to start
    _dispatcher.updateInvCounter(kernId, +1);

    // iterate over all CUs managed by the shader to perform the invalidate
    for (int i_cu = 0; i_cu < n_cu; ++i_cu) {
        // create a request to hold INV info; the request's fields will
        // be updated in the CU before use
        auto req = std::make_shared<Request>(0, 0, 0,
                                             cuList[i_cu]->requestorId(),
                                             0, -1);

        _dispatcher.updateInvCounter(kernId, +1);
        // all necessary INV flags are set now; call the CU to execute
        cuList[i_cu]->doInvalidate(req, task->dispatchId());

        // I don't like this. This is intrusive coding.
        cuList[i_cu]->resetRegisterPool();
    }
}
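
// Counter protocol: outstandingInvs() is -1 while an invalidate has never
// been started. The first updateInvCounter(+1) above moves it to 0 to mark
// the invalidate as started; each per-CU +1 is then matched by a -1 from
// the dispatcher when that CU's invalidate completes.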

/*
 * dispatcher/shader arranges flush requests to the CUs
 */
void
Shader::prepareFlush(GPUDynInstPtr gpuDynInst){
    int kernId = gpuDynInst->kern_id;
    // the flush has never been started; it is performed only once,
    // at kernel end
    assert(_dispatcher.getOutstandingWbs(kernId) == 0);

    // the first CU managed by the shader performs the flush operation,
    // assuming that the L2 cache is shared by all CUs in the shader
    int i_cu = 0;
    _dispatcher.updateWbCounter(kernId, +1);
    cuList[i_cu]->doFlush(gpuDynInst);
}

bool
Shader::dispatchWorkgroups(HSAQueueEntry *task)
{
    bool scheduledSomething = false;
    int cuCount = 0;
    int curCu = nextSchedCu;
    int disp_count(0);

    while (cuCount < n_cu) {
        // every time we try a CU, update nextSchedCu
        nextSchedCu = (nextSchedCu + 1) % n_cu;

        // dispatch a workgroup iff the following two conditions are met:
        // (a) the task is not complete - there are unassigned workgroups
        //     in the grid
        // (b) there are enough free slots in cuList[curCu] for this wg
        int num_wfs_in_wg = 0;
        bool can_disp = cuList[curCu]->hasDispResources(task, num_wfs_in_wg);
        if (!task->dispComplete() && can_disp) {
            scheduledSomething = true;
            DPRINTF(GPUDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUAgentDisp, "Dispatching a workgroup to CU %d: WG %d\n",
                    curCu, task->globalWgId());
            DPRINTF(GPUWgLatency, "WG Begin cycle:%d wg:%d cu:%d\n",
                    curTick(), task->globalWgId(), curCu);

            if (!cuList[curCu]->tickEvent.scheduled()) {
                if (!_activeCus)
                    _lastInactiveTick = curTick();
                _activeCus++;
            }

            panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
                     "Invalid activeCu size\n");
            cuList[curCu]->dispWorkgroup(task, num_wfs_in_wg);

            task->markWgDispatch();
            ++disp_count;
        }

        ++cuCount;
        curCu = nextSchedCu;
    }

    DPRINTF(GPUWgLatency, "Shader Dispatched %d Wgs\n", disp_count);

    return scheduledSomething;
}
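
// nextSchedCu advances on every attempt, successful or not, so consecutive
// dispatch calls resume the round-robin scan where the previous one left
// off rather than always favoring CU 0.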

void
Shader::doFunctionalAccess(const RequestPtr &req, MemCmd cmd, void *data,
                           bool suppress_func_errors, int cu_id)
{
    int block_size = cuList.at(cu_id)->cacheLineSize();
    unsigned size = req->getSize();

    Addr tmp_addr;
    BaseTLB::Mode trans_mode;

    if (cmd == MemCmd::ReadReq) {
        trans_mode = BaseTLB::Read;
    } else if (cmd == MemCmd::WriteReq) {
        trans_mode = BaseTLB::Write;
    } else {
        fatal("unexpected MemCmd\n");
    }

    tmp_addr = req->getVaddr();
    Addr split_addr = roundDown(tmp_addr + size - 1, block_size);

    assert(split_addr <= tmp_addr || split_addr - tmp_addr < block_size);

    // misaligned access that straddles a cache-line boundary
    if (split_addr > tmp_addr) {
        RequestPtr req1, req2;
        // req1 covers [vaddr, split_addr); req2 covers the remainder
        req->splitOnVaddr(split_addr, req1, req2);

        PacketPtr pkt1 = new Packet(req1, cmd);
        PacketPtr pkt2 = new Packet(req2, cmd);

        functionalTLBAccess(pkt1, cu_id, trans_mode);
        functionalTLBAccess(pkt2, cu_id, trans_mode);

        PacketPtr new_pkt1 = new Packet(pkt1->req, cmd);
        PacketPtr new_pkt2 = new Packet(pkt2->req, cmd);

        // pair the lower half (req1) with the start of the data buffer
        new_pkt1->dataStatic(data);
        new_pkt2->dataStatic((uint8_t*)data + req1->getSize());

        if (suppress_func_errors) {
            new_pkt1->setSuppressFuncError();
            new_pkt2->setSuppressFuncError();
        }

        // FIXME: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0].sendFunctional(new_pkt1);
        cuList[0]->memPort[0].sendFunctional(new_pkt2);

        delete new_pkt1;
        delete new_pkt2;
        delete pkt1;
        delete pkt2;
    } else {
        PacketPtr pkt = new Packet(req, cmd);
        functionalTLBAccess(pkt, cu_id, trans_mode);
        PacketPtr new_pkt = new Packet(pkt->req, cmd);
        new_pkt->dataStatic(data);

        if (suppress_func_errors) {
            new_pkt->setSuppressFuncError();
        }

        // FIXME: this should be cuList[cu_id] if cu_id != n_cu
        // The latter requires a memPort in the dispatcher
        cuList[0]->memPort[0].sendFunctional(new_pkt);

        delete new_pkt;
        delete pkt;
    }
}
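
// Worked example of the split: with a 64-byte line, an 8-byte access at
// vaddr 0x7c gives split_addr = roundDown(0x7c + 7, 64) = 0x80 > 0x7c, so
// the request is split at 0x80 into two 4-byte pieces, each translated and
// sent functionally on its own packet.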

void
Shader::ScheduleAdd(int *val, Tick when, int x)
{
    sa_val.push_back(val);
    when += curTick();
    sa_when.push_back(when);
    sa_x.push_back(x);
    ++sa_n;
    if (!tickEvent.scheduled() || (when < tickEvent.when())) {
        DPRINTF(GPUDisp, "New scheduled add; scheduling shader wakeup at "
                "%lu\n", when);
        reschedule(tickEvent, when, true);
    } else {
        assert(tickEvent.scheduled());
        DPRINTF(GPUDisp, "New scheduled add; wakeup already scheduled at "
                "%lu\n", when);
    }
}
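
// Illustrative use (hypothetical caller): to decrement a pending-request
// counter 500 ticks from now, a CU could call
//     shader->ScheduleAdd(&counter, 500, -1);
// 'when' is relative here and converted to an absolute tick before being
// queued; execScheduledAdds() later applies the add and pops the entry.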

void
Shader::AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                  MemCmd cmd, bool suppress_func_errors)
{
    uint8_t *data_buf = (uint8_t*)ptr;

    for (ChunkGenerator gen(address, size, cuList.at(cu_id)->cacheLineSize());
         !gen.done(); gen.next()) {

        RequestPtr req = std::make_shared<Request>(
            gen.addr(), gen.size(), 0,
            cuList[0]->requestorId(), 0, 0, nullptr);

        doFunctionalAccess(req, cmd, data_buf, suppress_func_errors, cu_id);
        data_buf += gen.size();
    }
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq, false);
}

void
Shader::ReadMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::ReadReq,
              suppress_func_errors);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq, false);
}

void
Shader::WriteMem(uint64_t address, void *ptr, uint32_t size, int cu_id,
                 bool suppress_func_errors)
{
    AccessMem(address, ptr, size, cu_id, MemCmd::WriteReq,
              suppress_func_errors);
}
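
// ReadMem/WriteMem are thin wrappers over AccessMem, which uses
// ChunkGenerator to break the access at cache-line granularity so that
// each functional request stays within a single cache line.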

/*
 * Send a packet through the appropriate TLB functional port.
 * If cu_id=n_cu, then this is the dispatcher's TLB.
 * Otherwise it's the TLB of the cu_id compute unit.
 */
void
Shader::functionalTLBAccess(PacketPtr pkt, int cu_id, BaseTLB::Mode mode)
{
    // update senderState; we need to know the gpuTc and the TLB mode
    pkt->senderState =
        new TheISA::GpuTLB::TranslationState(mode, gpuTc, false);

    // Even when the perLaneTLB flag is turned on, it is ok to send all
    // accesses through lane 0, since the lane # is not known here.
    // This isn't important since these are functional accesses.
    cuList[cu_id]->tlbPort[0].sendFunctional(pkt);

    /* safe_cast the senderState */
    TheISA::GpuTLB::TranslationState *sender_state =
        safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);

    delete sender_state->tlbEntry;
    delete pkt->senderState;
}
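
// The functional translation allocates a TranslationState (which in turn
// holds a TLB entry); both are freed here, since nothing else owns the
// packet's senderState on this functional path.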

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleStore(const Tick accessTime)
{
    stats.storeLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);
}

/*
 * allow the shader to sample stats from constituent devices
 */
void
Shader::sampleLoad(const Tick accessTime)
{
    stats.loadLatencyDist.sample(accessTime);
    stats.allLatencyDist.sample(accessTime);
}

void
Shader::sampleInstRoundTrip(std::vector<Tick> roundTripTime)
{
    // only sample instructions that go all the way to main memory
    if (roundTripTime.size() != InstMemoryHop::InstMemoryHopMax) {
        return;
    }

    Tick t1 = roundTripTime[0];
    Tick t2 = roundTripTime[1];
    Tick t3 = roundTripTime[2];
    Tick t4 = roundTripTime[3];
    Tick t5 = roundTripTime[4];

    stats.initToCoalesceLatency.sample(t2-t1);
    stats.rubyNetworkLatency.sample(t3-t2);
    stats.gmEnqueueLatency.sample(t4-t3);
    stats.gmToCompleteLatency.sample(t5-t4);
}

void
Shader::sampleLineRoundTrip(const std::map<Addr, std::vector<Tick>> &lineMap)
{
    stats.coalsrLineAddresses.sample(lineMap.size());
    std::vector<Tick> netTimes;

    // for each cache block address generated by a vmem inst, calculate
    // the round-trip time for that cache block
    for (auto& it : lineMap) {
        const std::vector<Tick>& timeVec = it.second;
        if (timeVec.size() == 2) {
            netTimes.push_back(timeVec[1] - timeVec[0]);
        }
    }

    // sort the cache block round-trip times so that the first
    // distribution always measures the fastest cache block and the
    // last distribution always measures the slowest
    std::sort(netTimes.begin(), netTimes.end());

    // sample the round-trip time of the Nth cache block into the
    // Nth distribution
    int idx = 0;
    for (auto& time : netTimes) {
        stats.cacheBlockRoundTrip[idx].sample(time);
        ++idx;
    }
}
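
// Because netTimes is sorted ascending before sampling, distribution 0
// always tracks an instruction's fastest cache block and the highest index
// its slowest, independent of the order in which the lines completed.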

void
Shader::notifyCuSleep() {
    // if all CUs attached to this shader are asleep, update shaderActiveTicks
    panic_if(_activeCus <= 0 || _activeCus > cuList.size(),
             "Invalid activeCu size\n");
    _activeCus--;
    if (!_activeCus)
        stats.shaderActiveTicks += curTick() - _lastInactiveTick;
}

Shader::ShaderStats::ShaderStats(Stats::Group *parent, int wf_size)
    : Stats::Group(parent),
      ADD_STAT(allLatencyDist, "delay distribution for all"),
      ADD_STAT(loadLatencyDist, "delay distribution for loads"),
      ADD_STAT(storeLatencyDist, "delay distribution for stores"),
      ADD_STAT(initToCoalesceLatency,
               "Ticks from vmem inst initiateAcc to coalescer issue"),
      ADD_STAT(rubyNetworkLatency,
               "Ticks from coalescer issue to coalescer hit callback"),
      ADD_STAT(gmEnqueueLatency,
               "Ticks from coalescer hit callback to GM pipe enqueue"),
      ADD_STAT(gmToCompleteLatency,
               "Ticks queued in GM pipes ordered response buffer"),
      ADD_STAT(coalsrLineAddresses,
               "Number of cache lines for coalesced request"),
      ADD_STAT(shaderActiveTicks,
               "Total ticks that any CU attached to this shader is active"),
      ADD_STAT(vectorInstSrcOperand,
               "vector instruction source operand distribution"),
      ADD_STAT(vectorInstDstOperand,
               "vector instruction destination operand distribution")
{
    allLatencyDist
        .init(0, 1600000, 10000)
        .flags(Stats::pdf | Stats::oneline);

    loadLatencyDist
        .init(0, 1600000, 10000)
        .flags(Stats::pdf | Stats::oneline);

    storeLatencyDist
        .init(0, 1600000, 10000)
        .flags(Stats::pdf | Stats::oneline);

    initToCoalesceLatency
        .init(0, 1600000, 10000)
        .flags(Stats::pdf | Stats::oneline);

    rubyNetworkLatency
        .init(0, 1600000, 10000)
        .flags(Stats::pdf | Stats::oneline);

    gmEnqueueLatency
        .init(0, 1600000, 10000)
        .flags(Stats::pdf | Stats::oneline);

    gmToCompleteLatency
        .init(0, 1600000, 10000)
        .flags(Stats::pdf | Stats::oneline);

    coalsrLineAddresses
        .init(0, 20, 1)
        .flags(Stats::pdf | Stats::oneline);

    vectorInstSrcOperand.init(4);
    vectorInstDstOperand.init(4);

    cacheBlockRoundTrip = new Stats::Distribution[wf_size];
    for (int idx = 0; idx < wf_size; ++idx) {
        std::stringstream namestr;
        ccprintf(namestr, "%s.cacheBlockRoundTrip%d",
                 static_cast<Shader*>(parent)->name(), idx);
        cacheBlockRoundTrip[idx]
            .init(0, 1600000, 10000)
            .name(namestr.str())
            .desc("Coalsr-to-coalsr time for the Nth cache block in an inst")
            .flags(Stats::pdf | Stats::oneline);
    }
}
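
// cacheBlockRoundTrip is sized by the wavefront width because a vector
// memory instruction generates at most one address per lane, hence at most
// wf_size distinct cache lines; an instruction touching N lines contributes
// one sample to each of the first N distributions (see sampleLineRoundTrip).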