gem5  v19.0.0.0
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
dispatcher.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  *
33  * Authors: Brad Beckmann,
34  * Marc Orr,
35  * Anthony Gutierrez
36  */
37 
38 
40 
41 #include "cpu/base.hh"
42 #include "debug/GPUDisp.hh"
43 #include "gpu-compute/cl_driver.hh"
44 #include "gpu-compute/cl_event.hh"
45 #include "gpu-compute/shader.hh"
46 #include "gpu-compute/wavefront.hh"
47 #include "mem/packet_access.hh"
48 
50 
// Constructor body of GpuDispatcher (the signature line is not present in
// this listing). The initializer list copies the PIO address, PIO latency,
// cpu, shader and driver pointers out of the params object p, fixes the PIO
// window at 4096 bytes, and binds tickEvent to exec() with CPU_Tick_Pri.
52  : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
53  pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
54  dispatchCount(0), dispatchActive(false), cpu(p->cpu),
55  shader(p->shader_pointer), driver(p->cl_driver),
56  tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
57  false, Event::CPU_Tick_Pri)
58 {
// Hand this dispatcher to both the shader and the driver.
59  shader->handshake(this);
60  driver->handshake(this);
61 
// Start with no workgroups waiting to be dispatched.
62  ndRange.wg_disp_rem = false;
63  ndRange.globalWgId = 0;
64 
// The first tick is scheduled at tick 0.
65  schedule(&tickEvent, 0);
66 
67  // translation port for the dispatcher
// NOTE(review): the format string contains a %d conversion but name() is the
// only argument supplied to csprintf -- confirm the intended port name.
68  tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);
69 
// NOTE(review): the line naming the statistic being configured is missing
// from this listing; the calls below set its name and description.
71  .name(name() + ".num_kernel_launched")
72  .desc("number of kernel launched")
73  ;
74 }
75 
// Factory for the dispatcher: allocates a GpuDispatcher from this params
// object and registers it through GpuDispatcher::setInstance().
76 GpuDispatcher *GpuDispatcherParams::create()
77 {
78  GpuDispatcher *dispatcher = new GpuDispatcher(this);
79  GpuDispatcher::setInstance(dispatcher);
80 
// NOTE(review): the return statement is missing from this listing.
82 }
83 
// Body of serialize(CheckpointOut &cp) (the signature line is not present in
// this listing). Saves a single scalar, event_tick: the tick at which
// tickEvent is scheduled, or 0 when it is not scheduled.
84 void
86 {
87  Tick event_tick = 0;
88 
// Checkpointing is refused while ndRange still has workgroups to dispatch.
89  if (ndRange.wg_disp_rem)
90  fatal("Checkpointing not supported during active workgroup execution");
91 
92  if (tickEvent.scheduled())
93  event_tick = tickEvent.when();
94 
95  SERIALIZE_SCALAR(event_tick);
96 
97 }
98 
// Body of unserialize(CheckpointIn &cp) (the signature line is not present
// in this listing). Restores event_tick from the checkpoint and, when it is
// non-zero, schedules tickEvent at that tick.
99 void
101 {
102  Tick event_tick;
103 
// NOTE(review): the statement controlled by this `if` is missing from this
// listing; presumably it deschedules the pending tickEvent -- confirm
// against the original file.
104  if (tickEvent.scheduled())
106 
107  UNSERIALIZE_SCALAR(event_tick);
108 
// A stored value of 0 means no tick was pending when the checkpoint was taken
// (serialize writes 0 in that case), so nothing is scheduled.
109  if (event_tick)
110  schedule(&tickEvent, event_tick);
111 }
112 
// Body of getAddrRanges() (the return type and signature lines are not
// present in this listing). Returns a list holding exactly one range: the
// PIO window that starts at pioAddr and spans pioSize bytes.
115 {
116  AddrRangeList ranges;
117 
118  DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
119  pioAddr, pioSize);
120 
121  ranges.push_back(RangeSize(pioAddr, pioSize));
122 
123  return ranges;
124 }
125 
// Body of read(PacketPtr pkt) (the signature line is not present in this
// listing). Services a PIO read inside the dispatcher's register window,
// turns pkt into an atomic response and returns pioDelay.
126 Tick
128 {
// The access must fall inside [pioAddr, pioAddr + pioSize).
129  assert(pkt->getAddr() >= pioAddr);
130  assert(pkt->getAddr() < pioAddr + pioSize);
131 
132  int offset = pkt->getAddr() - pioAddr;
133  pkt->allocate();
134 
135  DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
136 
// The first 8 bytes of the window form one register that must be read whole
// (offset 0, size 8); it returns the current value of dispatchActive.
137  if (offset < 8) {
138  assert(!offset);
139  assert(pkt->getSize() == 8);
140 
141  uint64_t retval = dispatchActive;
142  pkt->setLE(retval);
143  } else {
// Everything past the first 8 bytes maps onto the bytes of curTask.
144  offset -= 8;
// NOTE(review): the comparison is strict, so a read whose last byte is the
// last byte of HsaQueueEntry trips this assert -- confirm that is intended.
145  assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
146  char *curTaskPtr = (char*)&curTask;
147 
148  memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
149  }
150 
151  pkt->makeAtomicResponse();
152 
153  return pioDelay;
154 }
155 
// Body of write(PacketPtr pkt) (the signature line is not present in this
// listing). Services a PIO write inside the dispatcher's register window.
// A write at offset 0 launches the kernel described by curTask; a write at
// any other offset stores the payload into curTask. The packet is turned
// into an atomic response and pioDelay is returned.
156 Tick
158 {
// The access must fall inside [pioAddr, pioAddr + pioSize).
159  assert(pkt->getAddr() >= pioAddr);
160  assert(pkt->getAddr() < pioAddr + pioSize);
161 
162  int offset = pkt->getAddr() - pioAddr;
163 
// data_val is decoded only so it can be printed by the DPRINTF below.
164 #if TRACING_ON
165  uint64_t data_val = 0;
166 
167  switch (pkt->getSize()) {
168  case 1:
169  data_val = pkt->getLE<uint8_t>();
170  break;
171  case 2:
172  data_val = pkt->getLE<uint16_t>();
173  break;
174  case 4:
175  data_val = pkt->getLE<uint32_t>();
176  break;
177  case 8:
178  data_val = pkt->getLE<uint64_t>();
179  break;
180  default:
181  DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
182  }
183 
184  DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
185  pkt->getSize());
186 #endif
187  if (!offset) {
// Dispatch id handed to the next launch; it is incremented after every
// launch and keeps its value across calls because it is static.
188  static int nextId = 0;
189 
190  // The depends field of the qstruct, which was previously unused, is
191  // used to communicate with the simulated application.
192  if (curTask.depends) {
// curTask.depends is used as the address of a HostState that is read
// through the shader.
193  HostState hs;
194  shader->ReadMem((uint64_t)(curTask.depends), &hs,
195  sizeof(HostState), 0);
196 
197  // update event start time (in nano-seconds)
198  uint64_t start = curTick() / 1000;
199 
// Store the value into the start field of the _cl_event that hs.event
// points to.
200  shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
201  &start, sizeof(uint64_t), 0);
202  }
203 
204  // launch kernel
// NOTE(review): a source line is missing from this listing at this point.
206 
// Look up the NDRange entry stored under this dispatch id.
207  NDRange *ndr = &(ndRangeMap[nextId]);
208  // copy dispatch info
209  ndr->q = curTask;
210 
211  // update the numDispTask polled by the runtime
// val = 0, off = 1: accessUserVar reads the int at that address, adds 1 and
// writes it back.
212  accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
213 
// The total workgroup count is the product over the three dimensions of
// divCeil(gdSize, wgSize).
214  ndr->numWgTotal = 1;
215 
216  for (int i = 0; i < 3; ++i) {
217  ndr->wgId[i] = 0;
218  ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
219  ndr->numWgTotal *= ndr->numWg[i];
220  }
221 
222  ndr->numWgCompleted = 0;
223  ndr->globalWgId = 0;
224  ndr->wg_disp_rem = true;
225  ndr->execDone = false;
226  ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
227  ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
228  ndr->dispatchId = nextId;
229  ndr->curCid = pkt->req->contextId();
230  DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
// Queue the new id on execIds.
231  execIds.push(nextId);
232  ++nextId;
233 
234  dispatchActive = true;
235 
// NOTE(review): the statement inside this `if` is missing from this listing;
// presumably it schedules tickEvent -- confirm against the original file.
236  if (!tickEvent.scheduled()) {
238  }
239  } else {
240  // populate current task struct
241  // first 64 bits are launch reg
242  offset -= 8;
// NOTE(review): only the starting offset is bounded here; offset plus
// pkt->getSize() is not compared against sizeof(HsaQueueEntry).
243  assert(offset < sizeof(HsaQueueEntry));
244  char *curTaskPtr = (char*)&curTask;
245  memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
246  }
247 
248  pkt->makeAtomicResponse();
249 
250  return pioDelay;
251 }
252 
253 
254 Port &
255 GpuDispatcher::getPort(const std::string &if_name, PortID idx)
256 {
257  if (if_name == "translation_port") {
258  return *tlbPort;
259  }
260 
261  return DmaDevice::getPort(if_name, idx);
262 }
263 
// Per-tick dispatch routine (the signature line is not present in this
// listing; presumably this is the exec() that tickEvent invokes -- confirm).
// It walks the queue of pending kernel ids, dispatching workgroups for each
// until the shader refuses one, and then empties the queue of completed ids.
264 void
266 {
// Number of kernels that were re-queued during this call because a
// workgroup dispatch failed.
267  int fail_count = 0;
268 
269  // There are potentially multiple outstanding kernel launches.
270  // It is possible that the workgroups in a different kernel
271  // can fit on the GPU even if another kernel's workgroups cannot
272  DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
273 
// Failed kernels are pushed to the back of the queue, so the loop ends once
// the queue holds no more entries than the number of failures recorded.
274  while (execIds.size() > fail_count) {
275  int execId = execIds.front();
276 
277  while (ndRangeMap[execId].wg_disp_rem) {
278  // update the thread context
279  shader->updateContext(ndRangeMap[execId].curCid);
280 
281  // attempt to dispatch_workgroup
282  if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
283  // if we failed try the next kernel,
284  // it may have smaller workgroups.
285  // put it on the queue to retry later
286  DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
287  execIds.push(execId);
288  ++fail_count;
289  break;
290  }
291  }
292  // let's try the next kernel_id
// The front entry is removed in both cases: either it has no workgroups left
// to dispatch, or it was re-queued at the back above.
293  execIds.pop();
294  }
295 
296  DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
297 
// NOTE(review): the statement inside this `if` is missing from this listing;
// judging by the comment in the loop below it presumably wakes the host CPU
// -- confirm against the original file.
298  if (doneIds.size() && cpu) {
300  }
301 
// Log and discard every completed kernel id.
302  while (doneIds.size()) {
303  // wakeup the CPU if any Kernels completed this cycle
304  DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
305  doneIds.pop();
306  }
307 }
308 
// Body of notifyWgCompl(Wavefront *w) (the signature line is not present in
// this listing). Counts one more completed workgroup for the kernel that w
// belongs to and, when that kernel's last workgroup has completed, marks
// the kernel done and updates the variables and event polled by the host.
309 void
311 {
312  int kern_id = w->kernId;
313  DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
314  assert(ndRangeMap[kern_id].dispatchId == kern_id);
315  ndRangeMap[kern_id].numWgCompleted++;
316 
// All workgroups of this kernel have now completed.
317  if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
318  ndRangeMap[kern_id].execDone = true;
319  doneIds.push(kern_id);
320 
// val = 1, off = 0: accessUserVar writes the value 1 to addrToNotify.
321  if (ndRangeMap[kern_id].addrToNotify) {
322  accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
323  0);
324  }
325 
// val = 0, off = -1: accessUserVar reads the int at numDispLeft, subtracts 1
// and writes it back.
326  accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
327 
328  // update event end time (in nano-seconds)
329  if (ndRangeMap[kern_id].q.depends) {
// q.depends is used as the address of a HostState; its event field is read
// through the shader and then treated as a pointer to a _cl_event.
330  HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
331  uint64_t event;
332  shader->ReadMem((uint64_t)(&host_state->event), &event,
333  sizeof(uint64_t), 0);
334 
335  uint64_t end = curTick() / 1000;
336 
337  shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
338  sizeof(uint64_t), 0);
339  }
340  }
341 
// NOTE(review): the statement inside this `if` is missing from this listing;
// presumably it schedules tickEvent -- confirm against the original file.
342  if (!tickEvent.scheduled()) {
344  }
345 }
346 
// Body of scheduleDispatch() (the signature line is not present in this
// listing). Acts only when tickEvent is not already scheduled.
347 void
349 {
// NOTE(review): the statement controlled by this `if` is missing from this
// listing; presumably it schedules tickEvent -- confirm against the original
// file.
350  if (!tickEvent.scheduled())
352 }
353 
// Body of accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off) (the
// signature line is not present in this listing). Updates the int located
// at addr through the shader:
//  - off == 0: val is written to addr as given;
//  - off != 0: the int at addr is read into val, off is added to it, and
//    the sum is written back.
// Panics when cpu is null.
354 void
356 {
357  if (cpu) {
358  if (off) {
// Read-modify-write: fetch the current value, replacing the caller's val.
359  shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
360  true);
361  val += off;
362  }
363 
364  shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
365  } else {
366  panic("Cannot find host");
367  }
368 }
369 
370 // helper functions for driver to retrieve GPU attributes
// Returns the number of entries in the shader's cuList, converted to int.
// (The signature line is not present in this listing.)
371 int
373 {
374  return shader->cuList.size();
375 }
376 
// Body of wfSize() (the signature line is not present in this listing).
// Returns the wfSize() reported by the first entry of the shader's cuList;
// the other entries are not consulted.
377 int
379 {
380  return shader->cuList[0]->wfSize();
381 }
382 
// Body of setFuncargsSize(int funcargs_size) (the signature line is not
// present in this listing). Stores the given size in the shader's
// funcargs_size field.
383 void
385 {
386  shader->funcargs_size = funcargs_size;
387 }
388 
// Body of getStaticContextSize() (the signature line is not present in this
// listing). Returns the static context size reported by wfList[0][0] of the
// first entry of the shader's cuList; no other wavefront is consulted.
389 uint32_t
391 {
392  return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
393 }
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:167
#define DPRINTF(x,...)
Definition: trace.hh:229
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition: shader.cc:334
AddrRange RangeSize(Addr start, Addr size)
Definition: addr_range.hh:584
virtual void serialize(CheckpointOut &cp) const override
Serialize an object.
Definition: dispatcher.cc:85
Ports are used to interface objects to each other.
Definition: port.hh:60
Tick write(PacketPtr pkt) override
Pure virtual function that the device must implement.
Definition: dispatcher.cc:157
std::vector< ComputeUnit * > cuList
Definition: shader.hh:149
uint64_t event
Definition: qstruct.hh:104
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:175
BaseCPU * cpu
Definition: dispatcher.hh:84
Bitfield< 7 > i
uint32_t getStaticContextSize() const
Returns the size of the static hardware context of a wavefront.
Definition: dispatcher.cc:390
Tick when() const
Get the time that the event is scheduled.
Definition: eventq.hh:401
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition: eventq.hh:162
std::queue< int > execIds
Definition: dispatcher.hh:76
void updateContext(int cid)
Definition: shader.cc:124
ip6_addr_t addr
Definition: inet.hh:335
void handshake(GpuDispatcher *_dispatcher)
Definition: cl_driver.cc:89
void setFuncargsSize(int funcargs_size)
Definition: dispatcher.cc:384
Bitfield< 23, 0 > offset
Definition: types.hh:154
void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
Definition: dispatcher.cc:355
int kernId
Definition: wavefront.hh:163
uint64_t addrToNotify
Definition: qstruct.hh:80
uint64_t depends
Definition: qstruct.hh:77
Definition: cprintf.cc:42
T * getPtr()
get a pointer to the data ptr.
Definition: packet.hh:1090
int wfSize() const
Definition: dispatcher.cc:378
void deschedule(Event &event)
Definition: eventq.hh:750
Bitfield< 63 > val
Definition: misc.hh:771
void setLE(T v)
Set the value in the data pointer to v as little endian.
RequestPtr req
A pointer to the original request.
Definition: packet.hh:327
bool execDone
Definition: ndrange.hh:62
unsigned getSize() const
Definition: packet.hh:736
int curCid
Definition: ndrange.hh:67
#define UNSERIALIZE_SCALAR(scalar)
Definition: serialize.hh:645
HsaQueueEntry curTask
Definition: dispatcher.hh:70
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:365
Tick curTick()
The current simulated tick.
Definition: core.hh:47
int numWgCompleted
Definition: ndrange.hh:55
int funcargs_size
Definition: shader.hh:133
std::string csprintf(const char *format, const Args &...args)
Definition: cprintf.hh:162
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:385
void notifyWgCompl(Wavefront *w)
Definition: dispatcher.cc:310
ClDriver * driver
Definition: dispatcher.hh:86
void makeAtomicResponse()
Definition: packet.hh:949
DmaDeviceParams Params
Definition: dma_device.hh:175
uint64_t Tick
Tick count type.
Definition: types.hh:63
virtual void unserialize(CheckpointIn &cp) override
Unserialize an object.
Definition: dispatcher.cc:100
Bitfield< 27 > q
int numWg[3]
Definition: ndrange.hh:50
volatile uint32_t * numDispLeft
Definition: ndrange.hh:65
Addr getAddr() const
Definition: packet.hh:726
static void setInstance(GpuDispatcher *_instance)
Definition: dispatcher.hh:116
uint32_t wgSize[3]
Definition: qstruct.hh:59
uint32_t gdSize[3]
Definition: qstruct.hh:57
std::queue< int > doneIds
Definition: dispatcher.hh:78
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
Definition: dma_device.cc:282
Bitfield< 0 > w
Stats::Scalar num_kernelLaunched
Definition: dispatcher.hh:100
virtual const std::string name() const
Definition: sim_object.hh:120
A Packet is used to encapsulate a transfer between two objects in the memory system (e...
Definition: packet.hh:255
Bitfield< 10, 5 > event
static GpuDispatcher * getInstance()
Definition: dispatcher.hh:121
bool wg_disp_rem
Definition: ndrange.hh:60
Bitfield< 15 > system
Definition: misc.hh:999
#define SERIALIZE_SCALAR(scalar)
Definition: serialize.hh:643
uint32_t globalWgId
Definition: ndrange.hh:57
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Definition: statistics.hh:279
Shader * shader
Definition: dispatcher.hh:85
EventFunctionWrapper tickEvent
Definition: dispatcher.hh:87
int numWgTotal
Definition: ndrange.hh:52
std::ostream CheckpointOut
Definition: serialize.hh:68
void scheduleDispatch()
Definition: dispatcher.cc:348
Tick ticks(int numCycles) const
Definition: shader.hh:91
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
Definition: dispatcher.cc:114
volatile bool * addrToNotify
Definition: ndrange.hh:64
T divCeil(const T &a, const U &b)
Definition: intmath.hh:153
bool dispatchActive
Definition: dispatcher.hh:82
void schedule(Event &event, Tick when)
Definition: eventq.hh:744
HsaQueueEntry q
Definition: ndrange.hh:45
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
TLBPort * tlbPort
Definition: dispatcher.hh:141
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Definition: statistics.hh:312
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition: types.hh:237
Tick read(PacketPtr pkt) override
Pure virtual function that the device must implement.
Definition: dispatcher.cc:127
std::unordered_map< int, NDRange > ndRangeMap
Definition: dispatcher.hh:72
uint64_t numDispLeft
Definition: qstruct.hh:82
static GpuDispatcher * instance
Definition: dispatcher.hh:90
NDRange ndRange
Definition: dispatcher.hh:73
bool dispatch_workgroups(NDRange *ndr)
Definition: shader.cc:174
Bitfield< 0 > p
GpuDispatcher(const Params *p)
Definition: dispatcher.cc:51
int wgId[3]
Definition: ndrange.hh:48
void hostWakeUp(BaseCPU *cpu)
Definition: shader.cc:132
void allocate()
Allocate memory for the packet.
Definition: packet.hh:1232
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:352
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
Definition: dispatcher.cc:255
void handshake(GpuDispatcher *dispatcher)
Definition: shader.cc:223
int dispatchId
Definition: ndrange.hh:66

Generated on Fri Feb 28 2020 16:27:01 for gem5 by doxygen 1.8.13