gem5  v20.0.0.3
dispatcher.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * For use for simulation and test purposes only
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice,
11  * this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  * this list of conditions and the following disclaimer in the documentation
15  * and/or other materials provided with the distribution.
16  *
17  * 3. Neither the name of the copyright holder nor the names of its
18  * contributors may be used to endorse or promote products derived from this
19  * software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 
36 
37 #include "cpu/base.hh"
38 #include "debug/GPUDisp.hh"
39 #include "gpu-compute/cl_driver.hh"
40 #include "gpu-compute/cl_event.hh"
41 #include "gpu-compute/shader.hh"
42 #include "gpu-compute/wavefront.hh"
43 #include "mem/packet_access.hh"
44 
46 
48  : DmaDevice(p), _masterId(p->system->getMasterId(this, "disp")),
49  pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
50  dispatchCount(0), dispatchActive(false), cpu(p->cpu),
51  shader(p->shader_pointer), driver(p->cl_driver),
52  tickEvent([this]{ exec(); }, "GPU Dispatcher tick",
53  false, Event::CPU_Tick_Pri)
54 {
55  shader->handshake(this);
56  driver->handshake(this);
57 
58  ndRange.wg_disp_rem = false;
59  ndRange.globalWgId = 0;
60 
61  schedule(&tickEvent, 0);
62 
63  // translation port for the dispatcher
64  tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);
65 
67  .name(name() + ".num_kernel_launched")
68  .desc("number of kernel launched")
69  ;
70 }
71 
72 GpuDispatcher *GpuDispatcherParams::create()
73 {
74  GpuDispatcher *dispatcher = new GpuDispatcher(this);
75  GpuDispatcher::setInstance(dispatcher);
76 
78 }
79 
80 void
82 {
83  Tick event_tick = 0;
84 
85  if (ndRange.wg_disp_rem)
86  fatal("Checkpointing not supported during active workgroup execution");
87 
88  if (tickEvent.scheduled())
89  event_tick = tickEvent.when();
90 
91  SERIALIZE_SCALAR(event_tick);
92 
93 }
94 
95 void
97 {
98  Tick event_tick;
99 
100  if (tickEvent.scheduled())
102 
103  UNSERIALIZE_SCALAR(event_tick);
104 
105  if (event_tick)
106  schedule(&tickEvent, event_tick);
107 }
108 
111 {
112  AddrRangeList ranges;
113 
114  DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
115  pioAddr, pioSize);
116 
117  ranges.push_back(RangeSize(pioAddr, pioSize));
118 
119  return ranges;
120 }
121 
122 Tick
124 {
125  assert(pkt->getAddr() >= pioAddr);
126  assert(pkt->getAddr() < pioAddr + pioSize);
127 
128  int offset = pkt->getAddr() - pioAddr;
129  pkt->allocate();
130 
131  DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());
132 
133  if (offset < 8) {
134  assert(!offset);
135  assert(pkt->getSize() == 8);
136 
137  uint64_t retval = dispatchActive;
138  pkt->setLE(retval);
139  } else {
140  offset -= 8;
141  assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
142  char *curTaskPtr = (char*)&curTask;
143 
144  memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
145  }
146 
147  pkt->makeAtomicResponse();
148 
149  return pioDelay;
150 }
151 
152 Tick
154 {
155  assert(pkt->getAddr() >= pioAddr);
156  assert(pkt->getAddr() < pioAddr + pioSize);
157 
158  int offset = pkt->getAddr() - pioAddr;
159 
160 #if TRACING_ON
161  uint64_t data_val = 0;
162 
163  switch (pkt->getSize()) {
164  case 1:
165  data_val = pkt->getLE<uint8_t>();
166  break;
167  case 2:
168  data_val = pkt->getLE<uint16_t>();
169  break;
170  case 4:
171  data_val = pkt->getLE<uint32_t>();
172  break;
173  case 8:
174  data_val = pkt->getLE<uint64_t>();
175  break;
176  default:
177  DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
178  }
179 
180  DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
181  pkt->getSize());
182 #endif
183  if (!offset) {
184  static int nextId = 0;
185 
186  // The depends field of the qstruct, which was previously unused, is
 187  // used to communicate with the simulated application.
188  if (curTask.depends) {
189  HostState hs;
190  shader->ReadMem((uint64_t)(curTask.depends), &hs,
191  sizeof(HostState), 0);
192 
193  // update event start time (in nano-seconds)
194  uint64_t start = curTick() / 1000;
195 
196  shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
197  &start, sizeof(uint64_t), 0);
198  }
199 
200  // launch kernel
202 
203  NDRange *ndr = &(ndRangeMap[nextId]);
204  // copy dispatch info
205  ndr->q = curTask;
206 
207  // update the numDispTask polled by the runtime
208  accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);
209 
210  ndr->numWgTotal = 1;
211 
212  for (int i = 0; i < 3; ++i) {
213  ndr->wgId[i] = 0;
214  ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
215  ndr->numWgTotal *= ndr->numWg[i];
216  }
217 
218  ndr->numWgCompleted = 0;
219  ndr->globalWgId = 0;
220  ndr->wg_disp_rem = true;
221  ndr->execDone = false;
222  ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
223  ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
224  ndr->dispatchId = nextId;
225  ndr->curCid = pkt->req->contextId();
226  DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
227  execIds.push(nextId);
228  ++nextId;
229 
230  dispatchActive = true;
231 
232  if (!tickEvent.scheduled()) {
234  }
235  } else {
236  // populate current task struct
237  // first 64 bits are launch reg
238  offset -= 8;
239  assert(offset < sizeof(HsaQueueEntry));
240  char *curTaskPtr = (char*)&curTask;
241  memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
242  }
243 
244  pkt->makeAtomicResponse();
245 
246  return pioDelay;
247 }
248 
249 
250 Port &
251 GpuDispatcher::getPort(const std::string &if_name, PortID idx)
252 {
253  if (if_name == "translation_port") {
254  return *tlbPort;
255  }
256 
257  return DmaDevice::getPort(if_name, idx);
258 }
259 
260 void
262 {
263  int fail_count = 0;
264 
265  // There are potentially multiple outstanding kernel launches.
266  // It is possible that the workgroups in a different kernel
267  // can fit on the GPU even if another kernel's workgroups cannot
268  DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());
269 
270  while (execIds.size() > fail_count) {
271  int execId = execIds.front();
272 
273  while (ndRangeMap[execId].wg_disp_rem) {
274  //update the thread context
275  shader->updateContext(ndRangeMap[execId].curCid);
276 
277  // attempt to dispatch_workgroup
278  if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
279  // if we failed try the next kernel,
280  // it may have smaller workgroups.
 281  // put it on the queue to retry later
282  DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
283  execIds.push(execId);
284  ++fail_count;
285  break;
286  }
287  }
288  // let's try the next kernel_id
289  execIds.pop();
290  }
291 
292  DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());
293 
294  if (doneIds.size() && cpu) {
296  }
297 
298  while (doneIds.size()) {
299  // wakeup the CPU if any Kernels completed this cycle
300  DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
301  doneIds.pop();
302  }
303 }
304 
305 void
307 {
308  int kern_id = w->kernId;
309  DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
310  assert(ndRangeMap[kern_id].dispatchId == kern_id);
311  ndRangeMap[kern_id].numWgCompleted++;
312 
313  if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
314  ndRangeMap[kern_id].execDone = true;
315  doneIds.push(kern_id);
316 
317  if (ndRangeMap[kern_id].addrToNotify) {
318  accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
319  0);
320  }
321 
322  accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);
323 
324  // update event end time (in nano-seconds)
325  if (ndRangeMap[kern_id].q.depends) {
326  HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
327  uint64_t event;
328  shader->ReadMem((uint64_t)(&host_state->event), &event,
329  sizeof(uint64_t), 0);
330 
331  uint64_t end = curTick() / 1000;
332 
333  shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
334  sizeof(uint64_t), 0);
335  }
336  }
337 
338  if (!tickEvent.scheduled()) {
340  }
341 }
342 
343 void
345 {
346  if (!tickEvent.scheduled())
348 }
349 
350 void
352 {
353  if (cpu) {
354  if (off) {
355  shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
356  true);
357  val += off;
358  }
359 
360  shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
361  } else {
362  panic("Cannot find host");
363  }
364 }
365 
366 // helper functions for driver to retrieve GPU attributes
367 int
369 {
370  return shader->cuList.size();
371 }
372 
373 int
375 {
376  return shader->cuList[0]->wfSize();
377 }
378 
379 void
381 {
382  shader->funcargs_size = funcargs_size;
383 }
384 
385 uint32_t
387 {
388  return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
389 }
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:163
#define DPRINTF(x,...)
Definition: trace.hh:225
void AccessMem(uint64_t address, void *ptr, uint32_t size, int cu_id, MemCmd cmd, bool suppress_func_errors)
Definition: shader.cc:332
AddrRange RangeSize(Addr start, Addr size)
Definition: addr_range.hh:580
virtual void serialize(CheckpointOut &cp) const override
Serialize an object.
Definition: dispatcher.cc:81
Ports are used to interface objects to each other.
Definition: port.hh:56
Tick write(PacketPtr pkt) override
Pure virtual function that the device must implement.
Definition: dispatcher.cc:153
std::vector< ComputeUnit * > cuList
Definition: shader.hh:149
uint64_t event
Definition: qstruct.hh:104
#define fatal(...)
This implements a cprintf based fatal() function.
Definition: logging.hh:171
BaseCPU * cpu
Definition: dispatcher.hh:80
Bitfield< 7 > i
uint32_t getStaticContextSize() const
Returns the size of the static hardware context of a wavefront.
Definition: dispatcher.cc:386
std::queue< int > execIds
Definition: dispatcher.hh:72
void updateContext(int cid)
Definition: shader.cc:122
ip6_addr_t addr
Definition: inet.hh:330
void handshake(GpuDispatcher *_dispatcher)
Definition: cl_driver.cc:89
void setFuncargsSize(int funcargs_size)
Definition: dispatcher.cc:380
Bitfield< 23, 0 > offset
Definition: types.hh:152
void accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
Definition: dispatcher.cc:351
int kernId
Definition: wavefront.hh:161
uint64_t addrToNotify
Definition: qstruct.hh:80
uint64_t depends
Definition: qstruct.hh:77
Definition: cprintf.cc:40
T * getPtr()
get a pointer to the data ptr.
Definition: packet.hh:1084
int wfSize() const
Definition: dispatcher.cc:374
Bitfield< 63 > val
Definition: misc.hh:769
void setLE(T v)
Set the value in the data pointer to v as little endian.
RequestPtr req
A pointer to the original request.
Definition: packet.hh:321
bool execDone
Definition: ndrange.hh:62
unsigned getSize() const
Definition: packet.hh:730
int curCid
Definition: ndrange.hh:67
#define UNSERIALIZE_SCALAR(scalar)
Definition: serialize.hh:770
HsaQueueEntry curTask
Definition: dispatcher.hh:66
void WriteMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:363
Tick curTick()
The current simulated tick.
Definition: core.hh:44
int numWgCompleted
Definition: ndrange.hh:55
int funcargs_size
Definition: shader.hh:133
std::string csprintf(const char *format, const Args &...args)
Definition: cprintf.hh:158
void notifyWgCompl(Wavefront *w)
Definition: dispatcher.cc:306
ClDriver * driver
Definition: dispatcher.hh:82
void makeAtomicResponse()
Definition: packet.hh:943
DmaDeviceParams Params
Definition: dma_device.hh:171
uint64_t Tick
Tick count type.
Definition: types.hh:61
virtual void unserialize(CheckpointIn &cp) override
Unserialize an object.
Definition: dispatcher.cc:96
Bitfield< 27 > q
int numWg[3]
Definition: ndrange.hh:50
void deschedule(Event &event)
Definition: eventq.hh:943
volatile uint32_t * numDispLeft
Definition: ndrange.hh:65
Addr getAddr() const
Definition: packet.hh:720
static const Priority CPU_Tick_Pri
CPU ticks must come after other associated CPU events (such as writebacks).
Definition: eventq.hh:198
static void setInstance(GpuDispatcher *_instance)
Definition: dispatcher.hh:112
void schedule(Event &event, Tick when)
Definition: eventq.hh:934
uint32_t wgSize[3]
Definition: qstruct.hh:59
uint32_t gdSize[3]
Definition: qstruct.hh:57
std::queue< int > doneIds
Definition: dispatcher.hh:74
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
Definition: dma_device.cc:277
Bitfield< 0 > w
Stats::Scalar num_kernelLaunched
Definition: dispatcher.hh:96
A Packet is used to encapsulate a transfer between two objects in the memory system (e...
Definition: packet.hh:249
Bitfield< 10, 5 > event
static GpuDispatcher * getInstance()
Definition: dispatcher.hh:117
bool wg_disp_rem
Definition: ndrange.hh:60
Bitfield< 15 > system
Definition: misc.hh:997
#define SERIALIZE_SCALAR(scalar)
Definition: serialize.hh:763
uint32_t globalWgId
Definition: ndrange.hh:57
bool scheduled() const
Determine if the current event is scheduled.
Definition: eventq.hh:459
Derived & name(const std::string &name)
Set the name and marks this stat to print at the end of simulation.
Definition: statistics.hh:276
virtual const std::string name() const
Definition: sim_object.hh:129
Shader * shader
Definition: dispatcher.hh:81
EventFunctionWrapper tickEvent
Definition: dispatcher.hh:83
int numWgTotal
Definition: ndrange.hh:52
std::ostream CheckpointOut
Definition: serialize.hh:63
void scheduleDispatch()
Definition: dispatcher.cc:344
Tick ticks(int numCycles) const
Definition: shader.hh:91
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
Definition: dispatcher.cc:110
volatile bool * addrToNotify
Definition: ndrange.hh:64
T divCeil(const T &a, const U &b)
Definition: intmath.hh:99
bool dispatchActive
Definition: dispatcher.hh:78
HsaQueueEntry q
Definition: ndrange.hh:45
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
TLBPort * tlbPort
Definition: dispatcher.hh:137
Derived & desc(const std::string &_desc)
Set the description and marks this stat to print at the end of simulation.
Definition: statistics.hh:309
int16_t PortID
Port index/ID type, and a symbolic name for an invalid port id.
Definition: types.hh:235
Tick read(PacketPtr pkt) override
Pure virtual function that the device must implement.
Definition: dispatcher.cc:123
std::unordered_map< int, NDRange > ndRangeMap
Definition: dispatcher.hh:68
uint64_t numDispLeft
Definition: qstruct.hh:82
static GpuDispatcher * instance
Definition: dispatcher.hh:86
NDRange ndRange
Definition: dispatcher.hh:69
bool dispatch_workgroups(NDRange *ndr)
Definition: shader.cc:172
Bitfield< 0 > p
GpuDispatcher(const Params *p)
Definition: dispatcher.cc:47
int wgId[3]
Definition: ndrange.hh:48
Tick when() const
Get the time that the event is scheduled.
Definition: eventq.hh:499
void hostWakeUp(BaseCPU *cpu)
Definition: shader.cc:130
void allocate()
Allocate memory for the packet.
Definition: packet.hh:1226
void ReadMem(uint64_t address, void *ptr, uint32_t sz, int cu_id)
Definition: shader.cc:350
Port & getPort(const std::string &if_name, PortID idx=InvalidPortID) override
Get a port with a given name and index.
Definition: dispatcher.cc:251
void handshake(GpuDispatcher *dispatcher)
Definition: shader.cc:221
int dispatchId
Definition: ndrange.hh:66

Generated on Fri Jul 3 2020 15:53:02 for gem5 by doxygen 1.8.13