gem5 v23.0.0.1
Loading...
Searching...
No Matches
gpu_command_processor.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
33
34#include <cassert>
35
38#include "debug/GPUCommandProc.hh"
39#include "debug/GPUKernelInfo.hh"
42#include "mem/abstract_mem.hh"
43#include "mem/packet_access.hh"
46#include "params/GPUCommandProcessor.hh"
47#include "sim/full_system.hh"
48#include "sim/process.hh"
49#include "sim/proxy_ptr.hh"
51
52namespace gem5
53{
54
/**
 * Constructor fragment. NOTE(review): this doxygen listing drops the
 * signature line (55) — presumably
 * `GPUCommandProcessor::GPUCommandProcessor(const Params &p)`; confirm
 * against the real source. Wires the CP to its dispatcher, page-table
 * walker, and HSA packet processor, then registers this CP as the
 * packet processor's device.
 */
56 : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
57 walker(p.walker), hsaPP(p.hsapp)
58{
    // A packet processor is mandatory: later code (hsaPacketProc(),
    // initABI()) dereferences hsaPP unconditionally.
59 assert(hsaPP);
60 hsaPP->setDevice(this);
62}
63
/**
 * hsaPacketProc() — return a reference to the HSA packet processor
 * attached to this CP. (Signature lines 64-65 are missing from this
 * listing; the doxygen index gives the declaration as
 * `HSAPacketProcessor &hsaPacketProc()`.)
 */
66{
    // hsaPP was asserted non-null in the constructor, so the
    // dereference here is safe.
67 return *hsaPP;
68}
69
/**
 * vramRequestorId() — forward the VRAM requestor ID needed for device
 * memory accesses from the attached AMDGPUDevice. (Signature lines are
 * missing from this listing.)
 */
75{
76 return gpuDevice->vramRequestorId();
77}
78
/**
 * translate() — produce a TranslationGenPtr covering [vaddr, vaddr+size).
 * In SE mode the CPU page table of thread context 0 is used; in FS mode
 * the GPU's own page tables (set up by the kernel driver) are used
 * instead. (Signature lines are missing from this listing.)
 */
81{
82 if (!FullSystem) {
83 // Grab the process and try to translate the virtual address with it;
84 // with new extensions, it will likely be wrong to just arbitrarily
85 // grab context zero.
86 auto process = sys->threads[0]->getProcessPtr();
87
88 return process->pTable->translateRange(vaddr, size);
89 }
90
91 // In full system use the page tables setup by the kernel driver rather
92 // than the CPU page tables.
    // NOTE(review): listing drops line 94 — presumably the construction
    // of the GPU translation generator that takes (vmid, vaddr, size);
    // vmid 1 appears to be the only VMID used by this model (see the
    // same constant in submitDispatchPkt). Confirm against real source.
93 return TranslationGenPtr(
95 1 /* vmid */, vaddr, size));
96}
97
/**
 * submitDispatchPkt() — entry point into the CP from the HSAPP for AQL
 * kernel-dispatch packets. Reads the AMDKernelCode (AKC) descriptor of
 * the dispatched kernel object — through a translating port proxy when
 * it resides in system memory, or via cache-line-sized device-memory
 * reads otherwise — then builds an HSAQueueEntry task and hands it to
 * initABI() to begin ABI setup.
 *
 * NOTE(review): this doxygen listing drops several source lines (gaps
 * in the embedded numbering). Each gap is marked below; confirm the
 * reconstructed intent against the real gpu_command_processor.cc.
 */
115void
116GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
117 Addr host_pkt_addr)
118{
    // Monotonically increasing ID assigned to each dispatched task.
119 static int dynamic_task_id = 0;
    // [gap: line 120] presumably declares disp_pkt by casting raw_pkt
    // to the AQL dispatch packet type — confirm.
    // The kernel object must be cache-line aligned for the chunked
    // device-memory read below to work.
121 assert(!(disp_pkt->kernel_object & (system()->cacheLineSize() - 1)));
122
    // Thread context 0 is used for all address translation (same
    // assumption as translate()).
127 auto *tc = sys->threads[0];
128
129 TranslatingPortProxy fs_proxy(tc);
130 SETranslatingPortProxy se_proxy(tc);
131 PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
132
    // Decide whether the kernel object lives in system or device
    // memory. SE mode models no device memory, so default to system.
138 bool is_system_page = true;
139 Addr phys_addr = disp_pkt->kernel_object;
140 if (FullSystem) {
    // [gap: lines 141-146] comments dropped by the listing.
147 int vmid = 1;
148 unsigned tmp_bytes;
    // [gap: line 149] presumably the start of a
    // walker->startFunctional(...) call that fills phys_addr and
    // is_system_page — confirm.
150 phys_addr, tmp_bytes, BaseMMU::Mode::Read,
151 is_system_page);
152 }
153
154 DPRINTF(GPUCommandProc, "kernobj vaddr %#lx paddr %#lx size %d s:%d\n",
155 disp_pkt->kernel_object, phys_addr, sizeof(AMDKernelCode),
156 is_system_page);
157
    // Fetch the kernel's AMDKernelCode descriptor into host memory.
167 AMDKernelCode akc;
168 if (is_system_page) {
169 DPRINTF(GPUCommandProc, "kernel_object in system, using proxy\n");
170 virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)&akc,
171 sizeof(AMDKernelCode));
172 } else {
173 assert(FullSystem);
174 DPRINTF(GPUCommandProc, "kernel_object in device, using device mem\n");
175
176 // Read from GPU memory manager one cache line at a time to prevent
177 // rare cases where the AKC spans two memory pages.
178 ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
179 system()->cacheLineSize());
180 for (; !gen.done(); gen.next()) {
181 Addr chunk_addr = gen.addr();
182 int vmid = 1;
183 unsigned dummy;
    // [gap: line 184] presumably the start of another
    // walker->startFunctional(...) translation of chunk_addr — confirm.
185 chunk_addr, dummy, BaseMMU::Mode::Read,
186 is_system_page);
187
    // [gap: line 188] presumably declares the Request::Flags used
    // below (the index references Request::PHYSICAL) — confirm.
    // Untimed (functional) read of one cache line of the AKC directly
    // from device memory; readPkt borrows the akc buffer via
    // dataStatic, so nothing to free but the packet itself.
189 RequestPtr request = std::make_shared<Request>(chunk_addr,
190 system()->cacheLineSize(), flags, walker->getDevRequestor());
191 Packet *readPkt = new Packet(request, MemCmd::ReadReq);
192 readPkt->dataStatic((uint8_t *)&akc + gen.complete());
193 system()->getDeviceMemory(readPkt)->access(readPkt);
194 delete readPkt;
195 }
196 }
197
198 DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
199 "kernel object\n", akc.kernel_code_entry_byte_offset);
200
201 DPRINTF(GPUCommandProc,"GPUCommandProc: Sending dispatch pkt to %lu\n",
202 (uint64_t)tc->cpuId());
203
204
    // [gap: line 206] presumably
    // `+ akc.kernel_code_entry_byte_offset;` completing this
    // expression (matches the DPRINTF above) — confirm.
205 Addr machine_code_addr = (Addr)disp_pkt->kernel_object
207
208 DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
209 machine_code_addr);
210
211 std::string kernel_name;
212
    // [gap: lines 213-222] the condition selecting the kernel name is
    // missing — presumably it tests akc.runtime_loader_kernel_symbol
    // (referenced by the doxygen index) to distinguish user kernels
    // from blit kernels — confirm.
223 kernel_name = "Some kernel";
224 } else {
225 kernel_name = "Blit kernel";
226 }
227
228 DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
229
    // FS mode queries the device for the GFX version; SE mode asks the
    // emulated driver.
230 GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
231 : driver()->getGfxVersion();
232 HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
233 dynamic_task_id, raw_pkt, &akc, host_pkt_addr, machine_code_addr,
234 gfxVersion);
235
236 DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
237 "grid size (%dx%dx%d) kernarg addr: %#x, completion "
238 "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
239 disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
240 disp_pkt->grid_size_x, disp_pkt->grid_size_y,
241 disp_pkt->grid_size_z, disp_pkt->kernarg_address,
242 disp_pkt->completion_signal);
243
244 DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
245 "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
246 "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
247 task->numScalarRegs(), task->codeAddr(), 0, 0);
248
    // Hand the task to ABI initialization; dispatch happens later via
    // dispatchPkt() once ABI state is read from memory.
249 initABI(task);
250 ++dynamic_task_id;
251}
252
/**
 * functionalReadHsaSignal(Addr signal_handle) — untimed (functional)
 * read of the current 64-bit value of an HSA signal. The value lives
 * at a fixed offset from the handle, computed by
 * getHsaSignalValueAddr(). (The name line 254 is missing from this
 * listing.)
 */
253uint64_t
255{
256 Addr value_addr = getHsaSignalValueAddr(signal_handle);
    // Use thread context 0's address space for the virtual read.
257 auto tc = system()->threads[0];
258 ConstVPtr<Addr> prev_value(value_addr, tc);
259 return *prev_value;
260}
261
/**
 * updateHsaSignal() — write signal_value to the HSA signal at
 * signal_handle via a virtual DMA write, then, if the signal is
 * interruptible (non-zero mailbox pointer), notify the waiter: in FS
 * mode by DMA-writing the event value to the mailbox, in SE mode by
 * calling the driver's signalWakeupEvent().
 *
 * NOTE(review): listing drops line 264 — the trailing parameter,
 * declared in the header as
 * `HsaSignalCallbackFunction function = [](const uint64_t &) { }`.
 */
262void
263GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
265{
266 // The signal value is aligned 8 bytes from
267 // the actual handle in the runtime
268 Addr value_addr = getHsaSignalValueAddr(signal_handle);
269 Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
270 Addr event_addr = getHsaSignalEventAddr(signal_handle);
271 DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);
272
    // The callback owns signal_value in its dmaBuffer; DmaVirtCallback
    // manages its own lifetime after the write completes.
273 auto cb = new DmaVirtCallback<uint64_t>(function, signal_value);
274
275 dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
276
277 auto tc = system()->threads[0];
278 ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);
279
280 // Notifying an event with its mailbox pointer is
281 // not supported in the current implementation. Just use
282 // mailbox pointer to distinguish between interruptible
283 // and default signal. Interruptible signal will have
284 // a valid mailbox pointer.
285 if (*mailbox_ptr != 0) {
286 // This is an interruptible signal. Now, read the
287 // event ID and directly communicate with the driver
288 // about that event notification.
289 ConstVPtr<uint32_t> event_val(event_addr, tc);
290
291 DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
292 "signal event value %d\n", *event_val);
293
294 // The mailbox/wakeup signal uses the SE mode proxy port to write
295 // the event value. This is not available in full system mode so
296 // instead we need to issue a DMA write to the address. The value of
297 // *event_val clears the event.
298 if (FullSystem) {
299 auto cb = new DmaVirtCallback<uint64_t>(function, *event_val);
300 dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
301 } else {
302 signalWakeupEvent(*event_val);
303 }
304 }
305}
306
/**
 * attachDriver() — bind the emulated GPU compute driver to this CP
 * exactly once; a second attach is a fatal configuration error.
 * (Signature line 308 is missing from this listing; the header
 * declares `void attachDriver(GPUComputeDriver *driver)`.)
 */
307void
309{
310 fatal_if(_driver, "Should not overwrite driver.");
311 // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
312 // Should get rid of the base class.
313 _driver = gpu_driver;
314 assert(_driver);
315}
316
/**
 * driver() — accessor for the attached GPU compute driver.
 * (Signature lines are missing from this listing.) May return nullptr
 * until attachDriver() has been called.
 */
319{
320 return _driver;
321}
322
340void
341GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
342 Addr host_pkt_addr)
343{
344 hsaPP->finishPkt(raw_pkt, queue_id);
345}
346
/**
 * submitAgentDispatchPkt() — accept agent-dispatch AQL packets.
 * Supports two commands: Nop (log only) and Steal (look up the
 * completion-signal address of a previously dispatched kernel and
 * DMA-write it back to the packet's return_address). Any other type
 * panics. The packet is always retired on its queue at the end.
 *
 * NOTE(review): listing drops line 360 — presumably the cast of
 * raw_pkt to `(_hsa_agent_dispatch_packet_t *)` completing the
 * agent_pkt initializer; confirm against the real source.
 */
354void
355GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
356 Addr host_pkt_addr)
357{
358 //Parse the Packet, see what it wants us to do
359 _hsa_agent_dispatch_packet_t * agent_pkt =
361
362 if (agent_pkt->type == AgentCmd::Nop) {
363 DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
364 } else if (agent_pkt->type == AgentCmd::Steal) {
365 //This is where we steal the HSA Task's completion signal
366 int kid = agent_pkt->arg[0];
367 DPRINTF(GPUCommandProc,
368 "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
369 kid);
370
371 HSAQueueEntry *task = dispatcher.hsaTask(kid);
372 uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
373
374 uint64_t return_address = agent_pkt->return_address;
375 DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
376 //*return_address = signal_addr;
    // NOTE(review): new_signal_addr is heap-allocated and handed to
    // dmaWriteVirt with a null callback — nothing visible here frees
    // it once the DMA completes; looks like a leak. Verify whether
    // the DMA machinery takes ownership of the data buffer.
377 Addr *new_signal_addr = new Addr;
378 *new_signal_addr = (Addr)signal_addr;
379 dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
380
    // NOTE(review): %x is used for 64-bit values and a pointer here —
    // likely truncated/UB-ish in the debug output; harmless to
    // simulation but worth fixing in the real source.
381 DPRINTF(GPUCommandProc,
382 "Agent Dispatch Packet Stealing signal handle from kid %d :" \
383 "(%x:%x) writing into %x\n",
384 kid,signal_addr,new_signal_addr,return_address);
385
386 } else
387 {
388 panic("The agent dispatch packet provided an unknown argument in" \
389 "arg[0],currently only 0(nop) or 1(return kernel signal) is accepted");
390 }
391
    // Retire the packet regardless of command type.
392 hsaPP->finishPkt(raw_pkt, queue_id);
393}
394
/**
 * dispatchPkt() — after ABI state has been extracted from memory,
 * forward the fully-initialized task to the GPU dispatcher, which
 * creates and schedules its workgroups. (Signature line 402 is
 * missing from this listing.)
 */
401void
403{
404 dispatcher.dispatch(task);
405}
406
/**
 * signalWakeupEvent() — forward an HSA signal-event notification to
 * the emulated driver (SE-mode path; see updateHsaSignal()).
 * (Signature line 408 is missing from this listing.) Assumes a driver
 * has been attached — _driver is dereferenced unconditionally.
 */
407void
409{
410 _driver->signalWakeupEvent(event_id);
411}
412
/**
 * initABI() — kick off HSA-ABI initialization for a task by issuing a
 * virtual DMA read of the 32-bit read-dispatch-id offset, which sits
 * immediately after the queue's host read-index pointer. The
 * completion lambda continues setup in ReadDispIdOffsetDmaEvent().
 * (Signature line 420 is missing from this listing; the header
 * declares `void initABI(HSAQueueEntry *task)`.)
 */
419void
421{
    // Capture task by copy ([=]); the callback fires after this
    // function returns.
422 auto cb = new DmaVirtCallback<uint32_t>(
423 [ = ] (const uint32_t &readDispIdOffset)
424 { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);
425
426 Addr hostReadIdxPtr
427 = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
428
    // The offset field directly follows the read-index pointer, hence
    // the + sizeof(hostReadIdxPtr).
429 dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
430 sizeof(uint32_t), cb, &cb->dmaBuffer);
431}
432
/**
 * system() — accessor for the owning gem5 System. (Signature line 434
 * is missing from this listing.)
 */
433System*
435{
436 return sys;
437}
438
/**
 * getAddrRanges() — PIO-device obligation. The CP claims no PIO
 * address ranges, so an empty list is returned. (Signature lines are
 * missing from this listing.)
 */
441{
442 AddrRangeList ranges;
443 return ranges;
444}
445
/**
 * setGPUDevice() — bind the AMDGPUDevice this CP belongs to.
 * (Signature line 447 is missing from this listing.)
 * NOTE(review): line 450 is also missing — possibly a
 * walker->setDevRequestor(...) call propagating the device's VRAM
 * requestor ID (the index lists that setter); confirm.
 */
446void
448{
449 gpuDevice = gpu_device;
451}
452
/**
 * setShader() — bind the Shader model this CP dispatches work to.
 * (Signature line 454 is missing from this listing.)
 */
453void
455{
456 _shader = shader;
457}
458
/**
 * shader() — accessor for the bound Shader; pairs with setShader().
 * (Signature line 460 is missing from this listing.)
 */
459Shader*
461{
462 return _shader;
463}
464
465} // namespace gem5
AbstractMemory declaration.
#define DPRINTF(x,...)
Definition trace.hh:210
Declaration and inline definition of ChunkGenerator object.
Device model for an AMD GPU.
GfxVersion getGfxVersion() const
RequestorID vramRequestorId()
Methods related to translations and system/device memory.
Addr getPageTableBase(uint16_t vmid)
Definition amdgpu_vm.hh:286
This class takes an arbitrary memory region (address/length pair) and generates a series of appropria...
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void submitDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitDispatchPkt() is the entry point into the CP from the HSAPP and is only meant to be used with A...
void ReadDispIdOffsetDmaEvent(HSAQueueEntry *task, const uint32_t &readDispIdOffset)
Perform a DMA read of the read_dispatch_id_field_base_byte_offset field, which follows directly after...
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from GPU device.
Addr getHsaSignalMailboxAddr(Addr signal_handle)
void setGPUDevice(AMDGPUDevice *gpu_device)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Function used to translate a range of addresses from virtual to physical addresses.
void signalWakeupEvent(uint32_t event_id)
void updateHsaSignal(Addr signal_handle, uint64_t signal_value, HsaSignalCallbackFunction function=[](const uint64_t &) { })
HSAPacketProcessor & hsaPacketProc()
void submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitAgentDispatchPkt() is for accepting agent dispatch packets.
Addr getHsaSignalValueAddr(Addr signal_handle)
void attachDriver(GPUComputeDriver *driver)
void initABI(HSAQueueEntry *task)
The CP is responsible for traversing all HSA-ABI-related data structures from memory and initializing...
Addr getHsaSignalEventAddr(Addr signal_handle)
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void submitVendorPkt(void *raw_pkt, uint32_t queue_id, Addr host_pkt_addr)
submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
GPUCommandProcessorParams Params
void dispatchPkt(HSAQueueEntry *task)
Once the CP has finished extracting all relevant information about a task and has initialized the ABI...
std::function< void(const uint64_t &)> HsaSignalCallbackFunction
uint64_t functionalReadHsaSignal(Addr signal_handle)
GfxVersion getGfxVersion() const
virtual void signalWakeupEvent(uint32_t event_id)
void dispatch(HSAQueueEntry *task)
After all relevant HSA data structures have been traversed/extracted from memory by the CP,...
HSAQueueEntry * hsaTask(int disp_id)
Definition dispatcher.cc:65
void setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
Definition dispatcher.cc:72
HSAQueueDescriptor * getQueueDesc(uint32_t queId)
void finishPkt(void *pkt, uint32_t rl_idx)
void setDevice(GPUCommandProcessor *dev)
uint32_t queueId() const
Addr completionSignal() const
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
Definition packet.hh:295
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
Definition packet.hh:1175
This object is a proxy for a port or other object which implements the functional response protocol,...
Definition port_proxy.hh:87
void readBlob(Addr addr, void *p, int size) const
Higher level interfaces based on the above.
@ PHYSICAL
The virtual address is also the physical address.
Definition request.hh:117
memory::AbstractMemory * getDeviceMemory(const PacketPtr &pkt) const
Return a pointer to the device memory.
Definition system.cc:311
Threads threads
Definition system.hh:310
This proxy attempts to translate virtual addresses using the TLBs.
void setDevRequestor(RequestorID mid)
Fault startFunctional(Addr base, Addr vaddr, PageTableEntry &pte, unsigned &logBytes, BaseMMU::Mode mode)
RequestorID getDevRequestor() const
void access(PacketPtr pkt)
Perform an untimed memory access and update all the state (e.g.
The GPUDispatcher is the component of the shader that is responsible for creating and dispatching WGs...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
Addr addr() const
Return starting address of current chunk.
Addr complete() const
Number of bytes we have already chunked up.
bool done() const
Are we done? That is, did the last call to next() advance past the end of the region?
bool next()
Advance generator to next chunk.
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:188
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
Definition logging.hh:236
uint8_t flags
Definition helpers.cc:66
Bitfield< 0 > p
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
Definition request.hh:94
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
Definition root.cc:220
uint16_t RequestorID
Definition request.hh:95
std::unique_ptr< TranslationGen > TranslationGenPtr
int64_t kernel_code_entry_byte_offset
uint64_t runtime_loader_kernel_symbol
This file defines buffer classes used to handle pointer arguments in emulated syscalls.

Generated on Mon Jul 10 2023 15:32:03 for gem5 by doxygen 1.9.7