gem5 v24.1.0.1
gpu_command_processor.cc
/*
 * Copyright (c) 2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_command_processor.hh"

#include <cassert>

#include "base/chunk_generator.hh"
#include "base/trace.hh"
#include "debug/GPUCommandProc.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/GPUKernelInfo.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "dev/amdgpu/amdgpu_vm.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/shader.hh"
#include "mem/abstract_mem.hh"
#include "mem/packet_access.hh"
#include "mem/se_translating_port_proxy.hh"
#include "mem/translating_port_proxy.hh"
#include "params/GPUCommandProcessor.hh"
#include "sim/full_system.hh"
#include "sim/process.hh"
#include "sim/proxy_ptr.hh"
#include "sim/sim_exit.hh"
#include "sim/syscall_emul_buf.hh"

namespace gem5
{

GPUCommandProcessor::GPUCommandProcessor(const Params &p)
    : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
      walker(p.walker), hsaPP(p.hsapp),
      target_non_blit_kernel_id(p.target_non_blit_kernel_id)
{
    assert(hsaPP);
    hsaPP->setDevice(this);
    dispatcher.setCommandProcessor(this);
}
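
/*
 * Note: the dispatcher, HSA packet processor (hsaPP), page table walker,
 * and target_non_blit_kernel_id used above are SimObject parameters,
 * wired up by the Python configuration that instantiates the command
 * processor (see the generated params/GPUCommandProcessor.hh).
 */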

HSAPacketProcessor&
GPUCommandProcessor::hsaPacketProc()
{
    return *hsaPP;
}

/**
 * Forward the VRAM requestor ID needed for device memory from the GPU
 * device.
 */
RequestorID
GPUCommandProcessor::vramRequestorId()
{
    return gpuDevice->vramRequestorId();
}

TranslationGenPtr
GPUCommandProcessor::translate(Addr vaddr, Addr size)
{
    if (!FullSystem) {
        // Grab the process and try to translate the virtual address with it;
        // with new extensions, it will likely be wrong to just arbitrarily
        // grab context zero.
        auto process = sys->threads[0]->getProcessPtr();

        return process->pTable->translateRange(vaddr, size);
    }

    // In full system use the page tables setup by the kernel driver rather
    // than the CPU page tables.
    return TranslationGenPtr(
        new AMDGPUVM::UserTranslationGen(&gpuDevice->getVM(), walker,
                                         1 /* vmid */, vaddr, size));
}

void
GPUCommandProcessor::performTimingRead(PacketPtr pkt)
{
    // Use the shader to access the CUs and issue the read request from
    // the SQC port. The SQC port's timing response path then calls back
    // into the kernel dispatch logic. The read is scheduled at the
    // current tick.
    ComputeUnit *cu = shader()->cuList[0];
    pkt->senderState = new ComputeUnit::SQCPort::SenderState(
        cu->wfList[0][0], true);
    ComputeUnit::SQCPort::SenderState *sender_state =
        safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->senderState);
    ComputeUnit::SQCPort &sqc_port = cu->sqcPort;
    if (!sqc_port.sendTimingReq(pkt)) {
        sqc_port.retries.push_back(
            std::make_pair(pkt, sender_state->wavefront));
    }
}

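/**
 * Called from the CU's SQC port when one of the timing reads issued by
 * performTimingRead() above completes. Each call retires the read at the
 * front of kernelDispatchList; the kernel object is dispatched once the
 * final outstanding chunk has returned and the list has drained.
 */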
void
GPUCommandProcessor::completeTimingRead()
{
    struct KernelDispatchData dispatchData = kernelDispatchList.front();
    kernelDispatchList.pop_front();
    delete dispatchData.readPkt;
    if (kernelDispatchList.size() == 0)
        dispatchKernelObject(dispatchData.akc, dispatchData.raw_pkt,
                             dispatchData.queue_id, dispatchData.host_pkt_addr);
}

/**
 * submitDispatchPkt() is the entry point into the CP from the HSAPP
 * and is only meant to be used with AQL kernel dispatch packets.
 * After the HSAPP receives and extracts an AQL packet, it sends it to
 * the CP, which is responsible for gathering all relevant information
 * about a task, initializing CU state, and sending it to the
 * dispatcher for WG creation and dispatch.
 *
 * First we need to capture all information from the AQL packet and
 * the code object, then store it in an HSAQueueEntry. Once the packet
 * and code are extracted, we extract information from the queue
 * descriptor that the CP needs to perform state initialization on the
 * CU. Finally we call dispatch() to send the task to the dispatcher.
 * When the task completely finishes, we signal the host interrupt
 * line with a shader-directed interrupt.
 */
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
{
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    // The kernel object should be aligned to a 64B boundary, but not
    // necessarily a cache line boundary.
    unsigned akc_alignment_granularity = 64;
    assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));

    /**
     * If there are outstanding L2 invalidates, the launch is deferred
     * until they complete; otherwise the kernel could read stale data.
     * The shader stores the packet and replays the dispatch later.
     */
    if (shader()->getNumOutstandingInvL2s() > 0) {
        DPRINTF(GPUCommandProc,
                "Deferring kernel launch due to outstanding L2 invalidates\n");
        shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr);

        return;
    }

    /**
     * Allocate a buffer to hold the AKC. It is filled in either
     * functionally (SE mode) or by DMA / timing reads (FS mode) and
     * freed by dispatchKernelObject() once the task is created.
     */
    AMDKernelCode *akc = new AMDKernelCode;

    /**
     * The kernel_object is a pointer to the machine code, whose entry
     * point is an 'amd_kernel_code_t' type, which is included in the
     * kernel binary and describes various aspects of the kernel. The
     * desired entry is the 'kernel_code_entry_byte_offset' field, which
     * provides the byte offset (positive or negative) from the address
     * of the amd_kernel_code_t to the start of the machine instructions.
     * In SE mode the AKC can be read functionally through a proxy; in FS
     * mode it must be fetched with real memory accesses, handled below.
     */
    if (!FullSystem) {
        /**
         * In SE mode the AQL packet and kernel object live in host
         * memory, so the AKC can be read directly through a translating
         * proxy port on context zero.
         */
        auto *tc = sys->threads[0];
        SETranslatingPortProxy virt_proxy(tc);

        DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n");
        virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc,
                            sizeof(AMDKernelCode));

        dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
    } else {
        /**
         * In full system mode, the page table entry may point to a
         * system page or a device page. System pages use the proxy as
         * normal, but a device page needs to be read from device
         * memory. Check what type it is here.
         */
        bool is_system_page = true;
        Addr phys_addr = disp_pkt->kernel_object;

        /**
         * Full system currently only supports running on a single VMID
         * (one virtual memory space), i.e., one application running on
         * the GPU at a time. Because of this we know the VMID is always 1.
         */
        int vmid = 1;
        unsigned tmp_bytes;
        walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
                                phys_addr, tmp_bytes, BaseMMU::Mode::Read,
                                is_system_page);

        DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d"
                " s:%d\n", disp_pkt->kernel_object, phys_addr,
                sizeof(AMDKernelCode), is_system_page);

        /**
         * System objects are read with the DMA device from host memory;
         * device objects must be read from device memory below.
         */
        if (is_system_page) {
            DPRINTF(GPUCommandProc,
                    "sending system DMA read for kernel_object\n");

            auto dma_callback = new DmaVirtCallback<uint32_t>(
                [=](const uint32_t&) {
                    dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
                });

            dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode),
                        dma_callback, (void *)akc);
        } else {
            DPRINTF(GPUCommandProc,
                    "kernel_object in device, using device mem\n");

            // Read from the GPU memory manager one aligned 64B chunk at a
            // time to prevent rare cases where the AKC spans two memory
            // pages.
            ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
                               akc_alignment_granularity);
            for (; !gen.done(); gen.next()) {
                Addr chunk_addr = gen.addr();
                int vmid = 1;
                unsigned dummy;
                walker->startFunctional(
                    gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
                    dummy, BaseMMU::Mode::Read, is_system_page);

                Request::Flags flags = Request::PHYSICAL;
                RequestPtr request = std::make_shared<Request>(chunk_addr,
                    akc_alignment_granularity, flags,
                    vramRequestorId());
                PacketPtr readPkt = new Packet(request, MemCmd::ReadReq);
                readPkt->dataStatic((uint8_t *)akc + gen.complete());
                // If the request spans two device memories, the device
                // memory returned will be null.
                assert(system()->getDeviceMemory(readPkt) != nullptr);
                struct KernelDispatchData dispatchData;
                dispatchData.akc = akc;
                dispatchData.raw_pkt = raw_pkt;
                dispatchData.queue_id = queue_id;
                dispatchData.host_pkt_addr = host_pkt_addr;
                dispatchData.readPkt = readPkt;
                kernelDispatchList.push_back(dispatchData);
                performTimingRead(readPkt);
            }
        }
    }
}
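
/**
 * dispatchKernelObject() runs once the AKC has been read, either
 * functionally in SE mode or through DMA / timing reads in FS mode. It
 * names the task, builds its HSAQueueEntry, and either skips the kernel
 * (fast-forward) or hands it to initABI() for dispatch.
 */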
void
GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
                                          uint32_t queue_id, Addr host_pkt_addr)
{
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    sanityCheckAKC(akc);

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
            "kernel object\n", akc->kernel_code_entry_byte_offset);

    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
        + akc->kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
            machine_code_addr);

    std::string kernel_name;

    /**
     * BLIT kernels are built-in compute kernels issued by ROCm to
     * implement DMAs (e.g., when the SDMA hardware engines are
     * unavailable or disabled) and other copies the runtime prefers to
     * run on the shader. They carry no symbol names, so the names below
     * are only labels for debug output.
     */
    bool is_blit_kernel;
    if (!disp_pkt->completion_signal) {
        kernel_name = "Some kernel";
        is_blit_kernel = false;
    } else {
        kernel_name = "Blit kernel";
        is_blit_kernel = true;
    }

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

    GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
                                       : driver()->getGfxVersion();
    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
        dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
        gfxVersion);

    // The driver expects the start time to be in ns
    Tick start_ts = curTick() / sim_clock::as_int::ns;
    dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});

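    /*
     * target_non_blit_kernel_id is a configuration knob that allows a run
     * to fast-forward: application (non-blit) kernels whose ID is below
     * the target are completed immediately instead of being simulated.
     * Blit kernels are never skipped, so the runtime's copies still
     * execute and memory contents stay consistent.
     */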
    // Potentially skip a non-blit kernel
    if (!is_blit_kernel && (non_blit_kernel_id < target_non_blit_kernel_id)) {
        DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n",
                non_blit_kernel_id, dynamic_task_id);

        // Notify the HSA PP that this kernel is complete
        hsaPacketProc().finishPkt(task->dispPktPtr(), task->queueId());
        if (task->completionSignal()) {
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
                    "signal! Addr: %d\n", task->completionSignal());

            sendCompletionSignal(task->completionSignal());
        } else {
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
                    "signal\n");
        }

        // Bump the non-blit kernel count so we make progress toward the
        // target kernel ID.
        ++non_blit_kernel_id;

        delete akc;

        // Notify the run script that a kernel has been skipped
        exitSimLoop("Skipping GPU Kernel");

        return;
    }

    DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
            "grid size (%dx%dx%d) kernarg addr: %#x, completion "
            "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
            disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
            disp_pkt->grid_size_x, disp_pkt->grid_size_y,
            disp_pkt->grid_size_z, disp_pkt->kernarg_address,
            disp_pkt->completion_signal);

    DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
            "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
            "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
            task->numScalarRegs(), task->codeAddr(), 0, 0);

    initABI(task);
    ++dynamic_task_id;
    if (!is_blit_kernel) ++non_blit_kernel_id;

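    /*
     * Freeing the AKC here is safe: the HSAQueueEntry constructed above
     * copies everything it needs out of the code object, and initABI()
     * operates on the task rather than on the raw AKC buffer.
     */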
    delete akc;
}

void
GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
{
    // Originally the completion signal was read functionally and written
    // with a timing DMA. This can cause issues in FullSystem mode and
    // cause translation failures. Therefore, in FullSystem mode everything
    // is done in timing mode.

    if (!FullSystem) {
        /**
         * In SE mode the signal lives in host memory that the proxy
         * port can access directly, so it is safe to read the current
         * value functionally and write the decremented value back.
         */
        uint64_t signal_value = functionalReadHsaSignal(signal_handle);

        updateHsaSignal(signal_handle, signal_value - 1);
    } else {
        // The semantics of an HSA signal are to decrement the current
        // signal value by one. Do this asynchronously via DMAs and
        // callbacks, as we can safely continue with this function
        // while waiting for the next packet from the host.
        updateHsaSignalAsync(signal_handle, -1);
    }
}
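
/*
 * The helpers getHsaSignalValueAddr(), getHsaSignalMailboxAddr(), and
 * getHsaSignalEventAddr() compute the addresses of fields within the
 * runtime's amd_signal_t object, which (per the HSA runtime ABI this
 * code assumes) is laid out roughly as:
 *
 *     struct amd_signal_t {
 *         int64_t kind;
 *         volatile int64_t value;      // getHsaSignalValueAddr()
 *         uint64_t event_mailbox_ptr;  // getHsaSignalMailboxAddr()
 *         uint32_t event_id;           // getHsaSignalEventAddr()
 *         uint32_t reserved1;
 *         uint64_t start_ts;
 *         uint64_t end_ts;
 *         // ...
 *     };
 *
 * The asynchronous update below is a chain of DMA callbacks:
 * updateHsaSignalAsync() reads the mailbox pointer;
 * updateHsaMailboxData() clears the event for interruptible signals
 * (updateHsaEventData()) and writes the kernel start/end timestamps
 * (updateHsaEventTs()); updateHsaEventTs() then adjusts the signal
 * value via updateHsaSignalData(); and updateHsaSignalDone() frees the
 * scratch buffer.
 */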
void
GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
{
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
    uint64_t *mailboxValue = new uint64_t;
    auto cb2 = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
        { updateHsaMailboxData(signal_handle, mailboxValue); });
    dmaReadVirt(mailbox_addr, sizeof(uint64_t), cb2, (void *)mailboxValue);
    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n",
            mailbox_addr);
}

void
GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
                                          uint64_t *mailbox_value)
{
    Addr event_addr = getHsaSignalEventAddr(signal_handle);

    DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n", *mailbox_value);
    if (*mailbox_value != 0) {
        // This is an interruptible signal. Now, read the
        // event ID and directly communicate with the driver
        // about that event notification.
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &)
            { updateHsaEventData(signal_handle, mailbox_value); });
        dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
    } else {
        delete mailbox_value;

        Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);

        amd_event_t *event_ts = new amd_event_t;
        event_ts->start_ts = dispatchStartTime[signal_handle];
        event_ts->end_ts = curTick() / sim_clock::as_int::ns;
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &)
            { updateHsaEventTs(signal_handle, event_ts); });
        dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb, (void *)event_ts);
        DPRINTF(GPUCommandProc, "updateHsaMailboxData writing timestamp addr "
                "%lx\n", ts_addr);

        dispatchStartTime.erase(signal_handle);
    }
}

void
GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
                                        uint64_t *event_value)
{
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);

    DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value);
    // Write *event_value to the mailbox to clear the event
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
        { updateHsaSignalDone(event_value); }, *event_value);
    dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);

    Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);

    amd_event_t *event_ts = new amd_event_t;
    event_ts->start_ts = dispatchStartTime[signal_handle];
    event_ts->end_ts = curTick() / sim_clock::as_int::ns;
    auto cb2 = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
        { updateHsaEventTs(signal_handle, event_ts); });
    dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb2, (void *)event_ts);
    DPRINTF(GPUCommandProc, "updateHsaEventData writing timestamp addr %lx\n",
            ts_addr);

    dispatchStartTime.erase(signal_handle);
}

void
GPUCommandProcessor::updateHsaEventTs(Addr signal_handle,
                                      amd_event_t *ts)
{
    delete ts;

    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    int64_t diff = -1;

    uint64_t *signalValue = new uint64_t;
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
        { updateHsaSignalData(value_addr, diff, signalValue); });
    dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
    DPRINTF(GPUCommandProc, "updateHsaEventTs reading value addr %lx\n",
            value_addr);
}

void
GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
                                         uint64_t *prev_value)
{
    // Reuse the buffer allocated for the read to write back the
    // updated signal value.
    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
            *prev_value, *prev_value + diff);
    *prev_value += diff;
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
        { updateHsaSignalDone(prev_value); });
    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
}

void
GPUCommandProcessor::updateHsaSignalDone(uint64_t *signal_value)
{
    delete signal_value;
}

uint64_t
GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
{
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    auto tc = system()->threads[0];
    ConstVPtr<Addr> prev_value(value_addr, tc);
    return *prev_value;
}
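
/*
 * updateHsaSignal() writes a caller-supplied value into the signal. The
 * value itself goes out as a DMA write, while the mailbox and event ID
 * are read functionally through the proxy port, so this is primarily
 * the SE mode path of sendCompletionSignal(); the callback chain above
 * is the FS-safe, fully timing equivalent.
 */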
void
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
                                     HsaSignalCallbackFunction function)
{
    // The signal value is aligned 8 bytes from
    // the actual handle in the runtime
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
    Addr event_addr = getHsaSignalEventAddr(signal_handle);
    DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);

    auto cb = new DmaVirtCallback<uint64_t>(function, signal_value);

    dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);

    auto tc = system()->threads[0];
    ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);

    // Notifying an event with its mailbox pointer is not supported in
    // the current implementation. The mailbox pointer is only used to
    // distinguish between interruptible and default signals: an
    // interruptible signal has a valid mailbox pointer.
    if (*mailbox_ptr != 0) {
        // This is an interruptible signal. Now, read the
        // event ID and directly communicate with the driver
        // about that event notification.
        ConstVPtr<uint32_t> event_val(event_addr, tc);

        DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
                "signal event value %d\n", *event_val);

        // The mailbox/wakeup signal uses the SE mode proxy port to write
        // the event value. This is not available in full system mode, so
        // instead we need to issue a DMA write to the address. The value of
        // *event_val clears the event.
        if (FullSystem) {
            auto cb = new DmaVirtCallback<uint64_t>(function, *event_val);
            dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
        } else {
            signalWakeupEvent(*event_val);
        }
    }
}

void
GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver)
{
    fatal_if(_driver, "Should not overwrite driver.");
    // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
    // Should get rid of the base class.
    _driver = gpu_driver;
    assert(_driver);
}

GPUComputeDriver*
GPUCommandProcessor::driver()
{
    return _driver;
}

/**
 * submitVendorPkt() is for accepting vendor-specific packets from the
 * HSAPP. Vendor-specific packets may be used by the runtime to send
 * commands to the HSA device that are specific to a particular vendor.
 *
 * The current implementation does not handle any vendor commands: it
 * rings the completion signal, if the packet carries one, and then
 * retires the packet.
 */
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    auto vendor_pkt = (_hsa_generic_vendor_pkt *)raw_pkt;

    if (vendor_pkt->completion_signal) {
        sendCompletionSignal(vendor_pkt->completion_signal);
    }

    warn("Ignoring vendor packet\n");

    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * submitAgentDispatchPkt() is for accepting agent dispatch packets.
 * There is no specified format for these packets; currently only two
 * commands are handled: a NOP, and "stealing" the completion signal of
 * a previously dispatched HSA task so that the host can wait on it.
 */
void
GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                            Addr host_pkt_addr)
{
    // Parse the packet and see what it wants us to do.
    _hsa_agent_dispatch_packet_t *agent_pkt =
        (_hsa_agent_dispatch_packet_t *)raw_pkt;

    if (agent_pkt->type == AgentCmd::Nop) {
        DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
    } else if (agent_pkt->type == AgentCmd::Steal) {
        // This is where we steal the HSA task's completion signal.
        int kid = agent_pkt->arg[0];
        DPRINTF(GPUCommandProc,
                "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
                kid);

        HSAQueueEntry *task = dispatcher.hsaTask(kid);
        uint64_t signal_addr = task->completionSignal();

        uint64_t return_address = agent_pkt->return_address;
        DPRINTF(GPUCommandProc, "Return Addr: %p\n", return_address);

        Addr *new_signal_addr = new Addr;
        *new_signal_addr = (Addr)signal_addr;
        dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);

        DPRINTF(GPUCommandProc,
                "Agent Dispatch Packet Stealing signal handle from kid %d :"
                "(%x:%x) writing into %x\n",
                kid, signal_addr, new_signal_addr, return_address);
    } else {
        panic("The agent dispatch packet provided an unknown argument in "
              "arg[0]; currently only 0 (nop) or 1 (return kernel signal) "
              "is accepted");
    }

    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * Once the CP has finished extracting all relevant information about
 * a task and has initialized the ABI state, we send a description of
 * the task to the dispatcher. The dispatcher will create and dispatch
 * WGs to the CUs.
 */
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    dispatcher.dispatch(task);
}

void
GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
{
    _driver->signalWakeupEvent(event_id);
}

/**
 * The CP is responsible for traversing all HSA-ABI-related data
 * structures from memory and initializing the ABI state. Information
 * provided by the MQD, AQL packet, and code object metadata is used to
 * initialize register file state. The DMA read below kicks this off;
 * it continues in ReadDispIdOffsetDmaEvent() once the dispatch ID
 * offset has been fetched.
 */
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    auto cb = new DmaVirtCallback<uint32_t>(
        [ = ] (const uint32_t &readDispIdOffset)
        { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
                sizeof(uint32_t), cb, &cb->dmaBuffer);
}
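
/*
 * sanityCheckAKC() dumps every field of the kernel's AKC for debugging,
 * then rejects (fatal) or warns about AKC features the gem5 GPU model
 * does not implement.
 */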
void
GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
{
    DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
            akc->group_segment_fixed_size);
    DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
            akc->private_segment_fixed_size);
    DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size);
    DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
            akc->kernel_code_entry_byte_offset);
    DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset);
    DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split);
    DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
            akc->granulated_workitem_vgpr_count);
    DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
            akc->granulated_wavefront_sgpr_count);
    DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority);
    DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32);
    DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
            akc->float_mode_round_16_64);
    DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
            akc->float_mode_denorm_32);
    DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
            akc->float_mode_denorm_16_64);
    DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv);
    DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp);
    DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode);
    DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode);
    DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky);
    DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user);
    DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl);
    DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode);
    DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered);
    DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress);
    DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
            akc->enable_private_segment);
    DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count);
    DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler);
    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
            akc->enable_sgpr_workgroup_id_x);
    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
            akc->enable_sgpr_workgroup_id_y);
    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
            akc->enable_sgpr_workgroup_id_z);
    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
            akc->enable_sgpr_workgroup_info);
    DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
            akc->enable_vgpr_workitem_id);
    DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
            akc->enable_exception_address_watch);
    DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
            akc->enable_exception_memory);
    DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
            akc->enable_exception_ieee_754_fp_invalid_operation);
    DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
            akc->enable_exception_fp_denormal_source);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
            akc->enable_exception_ieee_754_fp_division_by_zero);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
            akc->enable_exception_ieee_754_fp_overflow);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
            akc->enable_exception_ieee_754_fp_underflow);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
            akc->enable_exception_ieee_754_fp_inexact);
    DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
            akc->enable_exception_int_divide_by_zero);
    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
            akc->enable_sgpr_private_segment_buffer);
    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
            akc->enable_sgpr_dispatch_ptr);
    DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
            akc->enable_sgpr_queue_ptr);
    DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
            akc->enable_sgpr_kernarg_segment_ptr);
    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
            akc->enable_sgpr_dispatch_id);
    DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
            akc->enable_sgpr_flat_scratch_init);
    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
            akc->enable_sgpr_private_segment_size);
    DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
            akc->enable_wavefront_size32);
    DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack);
    DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
            akc->kernarg_preload_spec_length);
    DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
            akc->kernarg_preload_spec_offset);

    // Check for features not implemented in gem5
    fatal_if(akc->wgp_mode, "WGP mode not supported\n");
    fatal_if(akc->mem_ordered, "Memory ordering control not supported\n");
    fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n");

    // Warn on features that gem5 will ignore
    warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n");
    warn_if(akc->bulky, "Bulky code object bit ignored\n");
    // TODO: All the IEEE bits

    fatal_if(akc->kernarg_preload_spec_length ||
             akc->kernarg_preload_spec_offset,
             "Kernarg preload not implemented\n");
    warn_if(akc->tg_split, "TG split not implemented\n");
}

System*
GPUCommandProcessor::system()
{
    return sys;
}

AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    AddrRangeList ranges;
    return ranges;
}

void
GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device)
{
    gpuDevice = gpu_device;
    walker->setDevRequestor(gpuDevice->vramRequestorId());
}

void
GPUCommandProcessor::setShader(Shader *shader)
{
    _shader = shader;
}

Shader*
GPUCommandProcessor::shader()
{
    return _shader;
}

} // namespace gem5