gem5  v22.1.0.0
gpu_command_processor.cc
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2018 Advanced Micro Devices, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  * this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * 3. Neither the name of the copyright holder nor the names of its
16  * contributors may be used to endorse or promote products derived from this
17  * software without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
33 
34 #include <cassert>
35 
37 #include "base/chunk_generator.hh"
38 #include "debug/GPUCommandProc.hh"
39 #include "debug/GPUKernelInfo.hh"
42 #include "mem/abstract_mem.hh"
43 #include "mem/packet_access.hh"
46 #include "params/GPUCommandProcessor.hh"
47 #include "sim/full_system.hh"
48 #include "sim/process.hh"
49 #include "sim/proxy_ptr.hh"
50 #include "sim/syscall_emul_buf.hh"
51 
52 namespace gem5
53 {
54 
56  : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
57  walker(p.walker), hsaPP(p.hsapp)
58 {
59  assert(hsaPP);
60  hsaPP->setDevice(this);
62 }
63 
66 {
67  return *hsaPP;
68 }
69 
75 {
76  return gpuDevice->vramRequestorId();
77 }
78 
81 {
82  if (!FullSystem) {
83  // Grab the process and try to translate the virtual address with it;
84  // with new extensions, it will likely be wrong to just arbitrarily
85  // grab context zero.
86  auto process = sys->threads[0]->getProcessPtr();
87 
88  return process->pTable->translateRange(vaddr, size);
89  }
90 
91  // In full system use the page tables setup by the kernel driver rather
92  // than the CPU page tables.
93  return TranslationGenPtr(
95  1 /* vmid */, vaddr, size));
96 }
97 
115 void
116 GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
117  Addr host_pkt_addr)
118 {
119  static int dynamic_task_id = 0;
120  _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;
121  assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));
122 
127  auto *tc = sys->threads[0];
128 
129  TranslatingPortProxy fs_proxy(tc);
130  SETranslatingPortProxy se_proxy(tc);
131  PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
132 
138  bool is_system_page = true;
139  Addr phys_addr = disp_pkt->kernel_object;
140  if (FullSystem) {
147  int vmid = 1;
148  unsigned tmp_bytes;
150  phys_addr, tmp_bytes, BaseMMU::Mode::Read,
151  is_system_page);
152  }
153 
154  DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n",
155  disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode),
156  is_system_page);
157 
167  AMDKernelCode akc;
168  if (is_system_page) {
169  DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n");
170  virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
171  sizeof(AMDKernelCode));
172  } else {
173  assert(FullSystem);
174  DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n");
175 
176  // Read from GPU memory manager one cache line at a time to prevent
177  // rare cases where the AKC spans two memory pages.
178  ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
179  system()->cacheLineSize());
180  for (; !gen.done(); gen.next()) {
181  Addr chunk_addr = gen.addr();
182  int vmid = 1;
183  unsigned dummy;
185  chunk_addr, dummy, BaseMMU::Mode::Read,
186  is_system_page);
187 
189  RequestPtr request = std::make_shared<Request>(chunk_addr,
190  system()->cacheLineSize(), flags, walker->getDevRequestor());
191  Packet *readPkt = new Packet(request, MemCmd::ReadReq);
192  readPkt->dataStatic((uint8_t *)&akc + gen.complete());
193  system()->getDeviceMemory(readPkt)->access(readPkt);
194  delete readPkt;
195  }
196  }
197 
198  DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
199  "kernel object\n", akc.kernel_code_entry_byte_offset);
200 
201  DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
202  (uint64_t)tc->cpuId());
203 
204 
205  Addr machine_code_addr = (Addr)disp_pkt->kernel_object
207 
208  DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
209  machine_code_addr);
210 
211  std::string kernel_name;
212 
223  kernel_name = "Some kernel";
224  } else {
225  kernel_name = "Blit kernel";
226  }
227 
228  DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
229 
230  HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
231  dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr);
232 
233  DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
234  "grid size (%dx%dx%d) kernarg addr: %#x, completion "
235  "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
236  disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
237  disp_pkt->grid_size_x, disp_pkt->grid_size_y,
238  disp_pkt->grid_size_z, disp_pkt->kernarg_address,
239  disp_pkt->completion_signal);
240 
241  DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
242  "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
243  "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
244  task->numScalarRegs(), task->codeAddr(), 0, 0);
245 
246  initABI(task);
247  ++dynamic_task_id;
248 }
249 
250 uint64_t
252 {
253  Addr value_addr = getHsaSignalValueAddr(signal_handle);
254  auto tc = system()->threads[0];
255  ConstVPtr<Addr> prev_value(value_addr, tc);
256  return *prev_value;
257 }
258 
259 void
260 GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
261  HsaSignalCallbackFunction function)
262 {
263  // The signal value is aligned 8 bytes from
264  // the actual handle in the runtime
265  Addr value_addr = getHsaSignalValueAddr(signal_handle);
266  Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
267  Addr event_addr = getHsaSignalEventAddr(signal_handle);
268  DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);
269 
270  auto cb = new DmaVirtCallback<uint64_t>(function, signal_value);
271 
272  dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
273 
274  auto tc = system()->threads[0];
275  ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);
276 
277  // Notifying an event with its mailbox pointer is
278  // not supported in the current implementation. Just use
279  // mailbox pointer to distinguish between interruptible
280  // and default signal. Interruptible signal will have
281  // a valid mailbox pointer.
282  if (*mailbox_ptr != 0) {
283  // This is an interruptible signal. Now, read the
284  // event ID and directly communicate with the driver
285  // about that event notification.
286  ConstVPtr<uint32_t> event_val(event_addr, tc);
287 
288  DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
289  "signal event value %d\n", *event_val);
290 
291  // The mailbox/wakeup signal uses the SE mode proxy port to write
292  // the event value. This is not available in full system mode so
293  // instead we need to issue a DMA write to the address. The value of
294  // *event_val clears the event.
295  if (FullSystem) {
296  auto cb = new DmaVirtCallback<uint64_t>(function, *event_val);
297  dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
298  } else {
299  signalWakeupEvent(*event_val);
300  }
301  }
302 }
303 
304 void
306 {
307  fatal_if(_driver, "Should not overwrite driver.");
308  // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
309  // Should get rid of the base class.
310  _driver = gpu_driver;
311  assert(_driver);
312 }
313 
316 {
317  return _driver;
318 }
319 
337 void
338 GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
339  Addr host_pkt_addr)
340 {
341  hsaPP->finishPkt(raw_pkt, queue_id);
342 }
343 
351 void
352 GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
353  Addr host_pkt_addr)
354 {
355  //Parse the Packet, see what it wants us to do
356  _hsa_agent_dispatch_packet_t * agent_pkt =
357  (_hsa_agent_dispatch_packet_t *)raw_pkt;
358 
359  if (agent_pkt->type == AgentCmd::Nop) {
360  DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
361  } else if (agent_pkt->type == AgentCmd::Steal) {
362  //This is where we steal the HSA Task's completion signal
363  int kid = agent_pkt->arg[0];
364  DPRINTF(GPUCommandProc,
365  "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
366  kid);
367 
368  HSAQueueEntry *task = dispatcher.hsaTask(kid);
369  uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
370 
371  uint64_t return_address = agent_pkt->return_address;
372  DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
373  //*return_address = signal_addr;
374  Addr *new_signal_addr = new Addr;
375  *new_signal_addr = (Addr)signal_addr;
376  dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
377 
378  DPRINTF(GPUCommandProc,
379  "Agent Dispatch Packet Stealing signal handle from kid %d :" \
380  "(%x:%x) writing into %x\n",
381  kid,signal_addr,new_signal_addr,return_address);
382 
383  } else
384  {
385  panic("The agent dispatch packet provided an unknown argument in" \
386  "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
387  }
388 
389  hsaPP->finishPkt(raw_pkt, queue_id);
390 }
391 
398 void
400 {
401  dispatcher.dispatch(task);
402 }
403 
404 void
406 {
407  _driver->signalWakeupEvent(event_id);
408 }
409 
416 void
418 {
419  auto cb = new DmaVirtCallback<uint32_t>(
420  [ = ] (const uint32_t &readDispIdOffset)
421  { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);
422 
423  Addr hostReadIdxPtr
424  = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
425 
426  dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
427  sizeof(uint32_t), cb, &cb->dmaBuffer);
428 }
429 
430 System*
432 {
433  return sys;
434 }
435 
438 {
439  AddrRangeList ranges;
440  return ranges;
441 }
442 
443 void
445 {
446  gpuDevice = gpu_device;
448 }
449 
450 void
452 {
453  _shader = shader;
454 }
455 
456 Shader*
458 {
459  return _shader;
460 }
461 
462 } // namespace gem5
AbstractMemory declaration.
#define DPRINTF(x,...)
Definition: trace.hh:186
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
AMDGPUVM & getVM()
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Addr getPageTableBase(uint16_t vmid)
Definition: amdgpu_vm.hh:272
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
DmaDeviceParams Params
Definition: dma_device.hh:209
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
void ReadDispIdOffsetDmaEvent(HSAQueueEntry *task, const uint32_t &readDispIdOffset)
Perform a DMA read of the read_dispatch_id_field_base_byte_offset field, which follows directly after...
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from GPU device.
Addr getHsaSignalMailboxAddr(Addr signal_handle)
void setGPUDevice(AMDGPUDevice *gpu_device)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void signalWakeupEvent(uint32_t event_id)
void updateHsaSignal(Addr signal_handle, uint64_t signal_value, HsaSignalCallbackFunction function=[](const uint64_t &) { })
HSAPacketProcessor & hsaPacketProc()
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitAgentDispatchPkt() is for accepting agent dispatch packets.
Addr getHsaSignalValueAddr(Addr signal_handle)
void attachDriver(GPUComputeDriver *driver)
void initABI(HSAQueueEntry *task)
The CP is responsible for traversing all HSA-ABI-related data structures from memory and initializing...
Addr getHsaSignalEventAddr(Addr signal_handle)
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
void dispatchPkt(HSAQueueEntry *task)
Once the CP has finished extracting all relevant information about a task and has initialized the ABI...
std::function< void(const uint64_t &)> HsaSignalCallbackFunction
uint64_t functionalReadHsaSignal(Addr signal_handle)
virtual void signalWakeupEvent(uint32_t event_id)
void dispatch(HSAQueueEntry *task)
After all relevant HSA data structures have been traversed/extracted from memory by the CP,...
Definition: dispatcher.cc:114
HSAQueueEntry * hsaTask(int disp_id)
Definition: dispatcher.cc:63
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
Definition: dispatcher.cc:70
HSAQueueDescriptor * getQueueDesc(uint32_t queId)
void finishPkt(void *pkt, uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
uint32_t queueId() const
Addr completionSignal() const
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition: packet.hh:294
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition: packet.hh:1162
System * sys
Definition: io_device.hh:105
This object is a proxy for a port or other object which implements the functional response protocol,...
Definition: port_proxy.hh:87
void readBlob(Addr addr, void *p, int size) const
Higher level interfaces based on the above.
Definition: port_proxy.hh:182
@ PHYSICAL
The virtual address is also the physical address.
Definition: request.hh:117
memory::AbstractMemory * getDeviceMemory(const PacketPtr &pkt) const
Return a pointer to the device memory.
Definition: system.cc:311
Threads threads
Definition: system.hh:313
This proxy attempts to translate virtual addresses using the TLBs.
void setDevRequestor(RequestorID mid)
Fault startFunctional(Addr base, Addr vaddr, PageTableEntry &pte, unsigned &logBytes, BaseMMU::Mode mode)
RequestorID getDevRequestor() const
void access(PacketPtr pkt)
Perform an untimed memory access and update all the state (e.g.
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
Addr addr() const
Return starting address of current chunk.
Addr complete() const
Number of bytes we have already chunked up.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
bool next()
Advance generator to next chunk.
#define panic(...)
This implements a cprintf based panic() function.
Definition: logging.hh:178
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition: logging.hh:226
uint8_t flags
Definition: helpers.cc:66
Bitfield< 54 > p
Definition: pagetable.hh:70
ProbePointArg< PacketInfo > Packet
Packet probe point.
Definition: mem.hh:109
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
Definition: request.hh:92
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition: types.hh:147
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
Definition: root.cc:220
uint16_t RequestorID
Definition: request.hh:95
std::unique_ptr< TranslationGen > TranslationGenPtr
def format Nop(code, *opt_flags)
Definition: nop.cc:82
int64_t kernel_code_entry_byte_offset
Definition: kernel_code.hh:90
uint64_t runtime_loader_kernel_symbol
Definition: kernel_code.hh:187
This file defines buffer classes used to handle pointer arguments in emulated syscalls.

Generated on Wed Dec 21 2022 10:22:35 for gem5 by doxygen 1.9.1