gem5 [DEVELOP-FOR-25.0]
gpu_command_processor.cc
1/*
2 * Copyright (c) 2018 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "gpu-compute/gpu_command_processor.hh"
33
34#include <cassert>
35
38#include "debug/GPUCommandProc.hh"
39#include "debug/GPUDisp.hh"
40#include "debug/GPUInitAbi.hh"
41#include "debug/GPUKernelInfo.hh"
45#include "gpu-compute/shader.hh"
46#include "mem/abstract_mem.hh"
47#include "mem/packet_access.hh"
50#include "params/GPUCommandProcessor.hh"
51#include "sim/full_system.hh"
52#include "sim/process.hh"
53#include "sim/proxy_ptr.hh"
54#include "sim/sim_exit.hh"
56
57namespace gem5
58{
59
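// Overview: the GPUCommandProcessor (CP) accepts commands from the HSA
// packet processor in the form of AQL packets, fetches and sanity checks
// the kernel descriptor (AMDKernelCode), sets up the HSA ABI state for the
// dispatch, and hands the resulting HSAQueueEntry to the GPUDispatcher. It
// also implements the completion-signal protocol used to notify the host
// when a kernel finishes.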
62 walker(p.walker), hsaPP(p.hsapp),
64{
65 assert(hsaPP);
66 hsaPP->setDevice(this);
67 dispatcher.setCommandProcessor(this);
68}
69
70HSAPacketProcessor&
71GPUCommandProcessor::hsaPacketProc()
72{
73 return *hsaPP;
74}
75
79RequestorID
80GPUCommandProcessor::vramRequestorId()
81{
82 return gpuDevice->vramRequestorId();
83}
84
85TranslationGenPtr
86GPUCommandProcessor::translate(Addr vaddr, Addr size)
87{
88 if (!FullSystem) {
89 // Grab the process and try to translate the virtual address with it;
90 // with new extensions, it will likely be wrong to just arbitrarily
91 // grab context zero.
92 auto process = sys->threads[0]->getProcessPtr();
93
94 return process->pTable->translateRange(vaddr, size);
95 }
96
97 // In full system use the page tables setup by the kernel driver rather
98 // than the CPU page tables.
99 return TranslationGenPtr(
101 1 /* vmid */, vaddr, size));
102}
103
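// A rough sketch of how callers are expected to consume the generator
// returned by translate() above (illustrative only; the Range field names
// are assumptions based on gem5's TranslationGen interface, not code from
// this file):
//
//     TranslationGenPtr gen = cp.translate(vaddr, size);
//     for (const auto &range : *gen) {
//         // range.vaddr, range.paddr and range.size describe one
//         // contiguously translated piece; range.fault is set if the
//         // walk failed for that piece.
//     }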
104void
105GPUCommandProcessor::performTimingRead(PacketPtr pkt, int dispType)
106{
107 // Use the shader to access the CUs and issue the read request through
108 // the SQC port. The kernel dispatch is submitted from the SQC port's
109 // timing-response handler once this read completes. The timing read is
110 // scheduled for the current tick.
111 ComputeUnit *cu = shader()->cuList[0];
112 auto *sender_state = new ComputeUnit::SQCPort::SenderState(
113 cu->wfList[0][0], true);
114
115 pkt->senderState = sender_state;
116 sender_state->dispatchType = dispType;
117 ComputeUnit::SQCPort &sqc_port = cu->sqcPort;
118
119 if (!sqc_port.sendTimingReq(pkt)) {
120 sqc_port.retries.push_back(
121 std::make_pair(pkt, cu->wfList[0][0])
122 );
123 }
124}
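// The read above is issued on CU0's SQC (instruction fetch) port so that
// fetching the kernel object or preload arguments consumes simulated time.
// If the port is blocked, the packet is parked on sqc_port.retries and is
// presumably resent when the SQC port receives a retry from the memory
// system; the dispatch then resumes from the port's timing-response path.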
125
126void
128{
129 struct KernelDispatchData dispatchData = kernelDispatchList.front();
130 kernelDispatchList.pop_front();
131 delete dispatchData.readPkt;
132
133 // Only one of the following can be outstanding at any time from a single
134 // CP. Figure out what initiated the timing read and call the appropriate function.
135 if (kernelDispatchList.size() == 0) {
136 switch (dispType) {
137 case ComputeUnit::SQCPort::SenderState::DISPATCH_KERNEL_OBJECT:
138 dispatchKernelObject(dispatchData.akc, dispatchData.raw_pkt,
139 dispatchData.queue_id, dispatchData.host_pkt_addr);
140 break;
141 case ComputeUnit::SQCPort::SenderState::DISPATCH_PRELOAD_ARG:
142 initPreload(dispatchData.akc, dispatchData.task);
143 break;
144 }
145 }
146}
147
165void
166GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
167 Addr host_pkt_addr)
168{
169 _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t *)raw_pkt;
170 // The kernel object should be aligned to a 64B boundary, but not
171 // necessarily a cache line boundary.
172 unsigned akc_alignment_granularity = 64;
173 assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));
174
182 if (shader()->getNumOutstandingInvL2s() > 0) {
183 DPRINTF(GPUCommandProc,
184 "Deferring kernel launch due to outstanding L2 invalidates\n");
185 shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr);
186
187 return;
188 }
189
194 AMDKernelCode *akc = new AMDKernelCode;
195
210 if (!FullSystem) {
215 auto *tc = sys->threads[0];
216 SETranslatingPortProxy virt_proxy(tc);
217
218 DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n");
219 virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc,
220 sizeof(AMDKernelCode));
221
222 dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
223 } else {
230 bool is_system_page = true;
231 Addr phys_addr = disp_pkt->kernel_object;
232
239 int vmid = 1;
240 unsigned tmp_bytes;
241 walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
242 phys_addr, tmp_bytes, BaseMMU::Mode::Read,
243 is_system_page);
244
245 DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d"
246 " s:%d\n", disp_pkt->kernel_object, phys_addr,
247 sizeof(AMDKernelCode), is_system_page);
248
253 if (is_system_page) {
254 DPRINTF(GPUCommandProc,
255 "sending system DMA read for kernel_object\n");
256
257 auto dma_callback = new DmaVirtCallback<uint32_t>(
258 [=](const uint32_t&) {
259 dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
260 });
261
262 dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode),
263 dma_callback, (void *)akc);
264 } else {
265 DPRINTF(GPUCommandProc,
266 "kernel_object in device, using device mem\n");
267
268 // Read from GPU memory manager one cache line at a time to prevent
269 // rare cases where the AKC spans two memory pages.
270 ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
271 akc_alignment_granularity);
272 for (; !gen.done(); gen.next()) {
273 Addr chunk_addr = gen.addr();
274 int vmid = 1;
275 unsigned dummy;
276 walker->startFunctional(
277 gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
278 dummy, BaseMMU::Mode::Read, is_system_page);
279
280 Request::Flags flags = Request::PHYSICAL;
281 RequestPtr request = std::make_shared<Request>(chunk_addr,
282 akc_alignment_granularity, flags,
283 walker->getDevRequestor());
284 PacketPtr readPkt = new Packet(request, MemCmd::ReadReq);
285 readPkt->dataStatic((uint8_t *)akc + gen.complete());
286 // If the request spans two device memories, the device memory
287 // returned will be null.
288 assert(system()->getDeviceMemory(readPkt) != nullptr);
289 struct KernelDispatchData dispatchData;
290 dispatchData.akc = akc;
291 dispatchData.raw_pkt = raw_pkt;
292 dispatchData.queue_id = queue_id;
293 dispatchData.host_pkt_addr = host_pkt_addr;
294 dispatchData.readPkt = readPkt;
295 kernelDispatchList.push_back(dispatchData);
296 performTimingRead(readPkt,
297 ComputeUnit::SQCPort::SenderState::DISPATCH_KERNEL_OBJECT);
298 }
299 }
300 }
301}
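// Summary of the three ways the AMDKernelCode (AKC) descriptor is fetched
// above: (1) in SE mode it is read directly through a translating port
// proxy; (2) in FS mode, if the GPU page table says the object lives in
// system memory, it is fetched with a single virtual DMA read; (3) if it
// lives in device memory, it is read 64 bytes at a time as timing requests
// so that an AKC spanning a page boundary is still handled correctly.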
302
303void
304GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
305 uint32_t queue_id, Addr host_pkt_addr)
306{
307 _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t *)raw_pkt;
308
315 if (akc->kernarg_preload_spec_length != 0) {
317 }
318
319 sanityCheckAKC(akc);
320
321 DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
322 "kernel object\n", akc->kernel_code_entry_byte_offset);
323
324 Addr machine_code_addr = (Addr)disp_pkt->kernel_object
325 + akc->kernel_code_entry_byte_offset;
326
327 DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
328 machine_code_addr);
329
330 std::string kernel_name;
331
341 bool is_blit_kernel;
342 if (!disp_pkt->completion_signal) {
343 kernel_name = "Some kernel";
344 is_blit_kernel = false;
345 } else {
346 kernel_name = "Blit kernel";
347 is_blit_kernel = true;
348 }
349
350 DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());
351
352 GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
353 : driver()->getGfxVersion();
354 HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
355 dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
356 gfxVersion);
357
358 // The driver expects the start time to be in ns
359 Tick start_ts = curTick() / sim_clock::as_int::ns;
360 dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
361
362 // Potentially skip a non-blit kernel
363 if (!is_blit_kernel && (non_blit_kernel_id < target_non_blit_kernel_id)) {
364 DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n",
365 non_blit_kernel_id, dynamic_task_id);
366
367 // Notify the HSA PP that this kernel is complete
368 hsaPacketProc().finishPkt(task->dispPktPtr(), task->queueId());
369 if (task->completionSignal()) {
370 DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
371 "signal! Addr: %d\n", task->completionSignal());
372
374 } else {
375 DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
376 "signal\n");
377 }
378
381
382 delete akc;
383
384 // Notify the run script that a kernel has been skipped
385 exitSimLoop("Skipping GPU Kernel");
386
387 return;
388 }
389
390 DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
391 "grid size (%dx%dx%d) kernarg addr: %#x, completion "
392 "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
393 disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
394 disp_pkt->grid_size_x, disp_pkt->grid_size_y,
395 disp_pkt->grid_size_z, disp_pkt->kernarg_address,
396 disp_pkt->completion_signal);
397
398 DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
399 "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
400 "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
401 task->numScalarRegs(), task->codeAddr(), 0, 0);
402
403 if (akc->kernarg_preload_spec_length == 0) {
404 initABI(task);
405
406 delete akc;
407 } else {
408 readPreload(akc, task);
409 }
410
412 if (!is_blit_kernel) ++non_blit_kernel_id;
413}
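// Kernel skipping: non-blit kernels are counted as they arrive
// (non_blit_kernel_id) and, while that count is below
// target_non_blit_kernel_id, the kernel is not dispatched. Its HSA packet
// is finished immediately, the completion-signal case is handled as on a
// normal completion, and exitSimLoop("Skipping GPU Kernel") returns control
// to the run script, which can use this to fast-forward to a kernel of
// interest.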
414
415void
416GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
417{
418 // Originally the completion signal was read functionally and written
419 // with a timing DMA. This can cause issues in FullSystem mode and
420 // cause translation failures. Therefore, in FullSystem mode everything
421 // is done in timing mode.
422
423 if (!FullSystem) {
430 uint64_t signal_value = functionalReadHsaSignal(signal_handle);
431
432 updateHsaSignal(signal_handle, signal_value - 1);
433 } else {
434 // The semantics of the HSA signal is to decrement the current
435 // signal value by one. Do this asynchronously via DMAs and
436 // callbacks as we can safely continue with this function
437 // while waiting for the next packet from the host.
438 updateHsaSignalAsync(signal_handle, -1);
439 }
440}
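// In FS mode the decrement is carried out entirely with DMAs. The callback
// chain rooted here is, roughly:
//   updateHsaSignalAsync  -> read the signal's mailbox pointer
//   updateHsaMailboxData  -> interruptible signal: read the event ID;
//                            otherwise write start/end timestamps
//   updateHsaEventData    -> write the event value back to the mailbox and
//                            write the timestamps
//   updateHsaEventTs      -> read the current signal value
//   updateHsaSignalData   -> write back value + diff (diff is -1 here)
//   updateHsaSignalDone   -> free the temporary buffer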
441
442void
443GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
444{
445 Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
446 uint64_t *mailboxValue = new uint64_t;
447 auto cb2 = new DmaVirtCallback<uint64_t>(
448 [ = ] (const uint64_t &)
449 { updateHsaMailboxData(signal_handle, mailboxValue); });
450 dmaReadVirt(mailbox_addr, sizeof(uint64_t), cb2, (void *)mailboxValue);
451 DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n",
452 mailbox_addr);
453}
454
455void
456GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
457 uint64_t *mailbox_value)
458{
459 Addr event_addr = getHsaSignalEventAddr(signal_handle);
460
461 DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n", *mailbox_value);
462 if (*mailbox_value != 0) {
463 // This is an interruptible signal. Now, read the
464 // event ID and directly communicate with the driver
465 // about that event notification.
466 auto cb = new DmaVirtCallback<uint64_t>(
467 [ = ] (const uint64_t &)
468 { updateHsaEventData(signal_handle, mailbox_value); });
469 dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
470 } else {
471 delete mailbox_value;
472
473 Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);
474
475 amd_event_t *event_ts = new amd_event_t;
476 event_ts->start_ts = dispatchStartTime[signal_handle];
477 event_ts->end_ts = curTick() / sim_clock::as_int::ns;
478 auto cb = new DmaVirtCallback<uint64_t>(
479 [ = ] (const uint64_t &)
480 { updateHsaEventTs(signal_handle, event_ts); });
481 dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb, (void *)event_ts);
482 DPRINTF(GPUCommandProc, "updateHsaMailboxData reading timestamp addr "
483 "%lx\n", ts_addr);
484
485 dispatchStartTime.erase(signal_handle);
486 }
487}
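// A non-zero mailbox word marks the signal as interruptible: the event ID
// is read and the driver is notified via the mailbox write issued from
// updateHsaEventData(). For a default (non-interruptible) signal the
// mailbox value is discarded and only the amd_event_t start/end timestamps,
// taken from dispatchStartTime and curTick(), are written back before the
// signal value itself is decremented.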
488
489void
490GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
491 uint64_t *event_value)
492{
493 Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
494
495 DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value);
496 // Write *event_value to the mailbox to clear the event
497 auto cb = new DmaVirtCallback<uint64_t>(
498 [ = ] (const uint64_t &)
499 { updateHsaSignalDone(event_value); }, *event_value);
500 dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);
501
502 Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);
503
504 amd_event_t *event_ts = new amd_event_t;
505 event_ts->start_ts = dispatchStartTime[signal_handle];
506 event_ts->end_ts = curTick() / sim_clock::as_int::ns;
507 auto cb2 = new DmaVirtCallback<uint64_t>(
508 [ = ] (const uint64_t &)
509 { updateHsaEventTs(signal_handle, event_ts); });
510 dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb2, (void *)event_ts);
511 DPRINTF(GPUCommandProc, "updateHsaEventData reading timestamp addr %lx\n",
512 ts_addr);
513
514 dispatchStartTime.erase(signal_handle);
515}
516
517void
518GPUCommandProcessor::updateHsaEventTs(Addr signal_handle,
519 amd_event_t *ts)
520{
521 delete ts;
522
523 Addr value_addr = getHsaSignalValueAddr(signal_handle);
524 int64_t diff = -1;
525
526 uint64_t *signalValue = new uint64_t;
527 auto cb = new DmaVirtCallback<uint64_t>(
528 [ = ] (const uint64_t &)
529 { updateHsaSignalData(value_addr, diff, signalValue); });
530 dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
531 DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading value addr %lx\n",
532 value_addr);
533}
534
535void
536GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
537 uint64_t *prev_value)
538{
539 // Reuse the value allocated for the read
540 DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
541 *prev_value, *prev_value + diff);
542 *prev_value += diff;
543 auto cb = new DmaVirtCallback<uint64_t>(
544 [ = ] (const uint64_t &)
545 { updateHsaSignalDone(prev_value); });
546 dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
547}
548
549void
550GPUCommandProcessor::updateHsaSignalDone(uint64_t *signal_value)
551{
552 delete signal_value;
553}
554
555uint64_t
556GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
557{
558 Addr value_addr = getHsaSignalValueAddr(signal_handle);
559 auto tc = system()->threads[0];
560 ConstVPtr<Addr> prev_value(value_addr, tc);
561 return *prev_value;
562}
563
564void
565GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
566 HsaSignalCallbackFunction function)
567{
568 // The signal value is aligned 8 bytes from
569 // the actual handle in the runtime
570 Addr value_addr = getHsaSignalValueAddr(signal_handle);
571 Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
572 Addr event_addr = getHsaSignalEventAddr(signal_handle);
573 DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);
574
575 auto cb = new DmaVirtCallback<uint64_t>(function, signal_value);
576
577 dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
578
579 auto tc = system()->threads[0];
580 ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);
581
582 // Notifying an event with its mailbox pointer is
583 // not supported in the current implementation. Just use
584 // mailbox pointer to distinguish between interruptible
585 // and default signal. Interruptible signal will have
586 // a valid mailbox pointer.
587 if (*mailbox_ptr != 0) {
588 // This is an interruptible signal. Now, read the
589 // event ID and directly communicate with the driver
590 // about that event notification.
591 ConstVPtr<uint32_t> event_val(event_addr, tc);
592
593 DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
594 "signal event value %d\n", *event_val);
595
596 // The mailbox/wakeup signal uses the SE mode proxy port to write
597 // the event value. This is not available in full system mode so
598 // instead we need to issue a DMA write to the address. The value of
599 // *event_val clears the event.
600 if (FullSystem) {
601 auto cb = new DmaVirtCallback<uint64_t>(function, *event_val);
602 dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
603 } else {
604 signalWakeupEvent(*event_val);
605 }
606 }
607}
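// updateHsaSignal() is the older, partly functional path: the signal value
// is written with a DMA, but the mailbox and event words are read through
// the SE-mode proxy (ConstVPtr). That proxy is not usable in full system,
// so the event notification becomes a DMA write there, while SE mode calls
// straight into the driver's signalWakeupEvent().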
608
609void
610GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver)
611{
612 fatal_if(_driver, "Should not overwrite driver.");
613 // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
614 // Should get rid of the base class.
615 _driver = gpu_driver;
616 assert(_driver);
617}
618
619GPUComputeDriver*
620GPUCommandProcessor::driver()
621{
622 return _driver;
623}
624
632
643void
644GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
645 Addr host_pkt_addr)
646{
647 auto vendor_pkt = (_hsa_generic_vendor_pkt *)raw_pkt;
648
649 if (vendor_pkt->completion_signal) {
650 sendCompletionSignal(vendor_pkt->completion_signal);
651 }
652
653 warn("Ignoring vendor packet\n");
654
655 hsaPP->finishPkt(raw_pkt, queue_id);
656}
657
665void
666GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
667 Addr host_pkt_addr)
668{
669 //Parse the Packet, see what it wants us to do
670 _hsa_agent_dispatch_packet_t * agent_pkt =
671 (_hsa_agent_dispatch_packet_t *)raw_pkt;
672
673 if (agent_pkt->type == AgentCmd::Nop) {
674 DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
675 } else if (agent_pkt->type == AgentCmd::Steal) {
676 //This is where we steal the HSA Task's completion signal
677 int kid = agent_pkt->arg[0];
678 DPRINTF(GPUCommandProc,
679 "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
680 kid);
681
682 HSAQueueEntry *task = dispatcher.hsaTask(kid);
683 uint64_t signal_addr = task->completionSignal();// + sizeof(uint64_t);
684
685 uint64_t return_address = agent_pkt->return_address;
686 DPRINTF(GPUCommandProc, "Return Addr: %p\n",return_address);
687 //*return_address = signal_addr;
688 Addr *new_signal_addr = new Addr;
689 *new_signal_addr = (Addr)signal_addr;
690 dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);
691
692 DPRINTF(GPUCommandProc,
693 "Agent Dispatch Packet Stealing signal handle from kid %d :" \
694 "(%x:%x) writing into %x\n",
695 kid,signal_addr,new_signal_addr,return_address);
696
697 } else
698 {
699 panic("The agent dispatch packet provided an unknown argument in "
700 "arg[0]; currently only 0 (nop) or 1 (return kernel signal) is accepted");
701 }
702
703 hsaPP->finishPkt(raw_pkt, queue_id);
704}
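// For AgentCmd::Steal the CP looks up the kernel's HSAQueueEntry, takes the
// address of its completion signal, and DMA-writes that address to the
// packet's return_address, presumably so the host can wait on the original
// kernel's signal directly rather than on the agent packet itself.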
705
712void
713GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
714{
715 dispatcher.dispatch(task);
716}
717
718void
719GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
720{
721 _driver->signalWakeupEvent(event_id);
722}
723
724void
725GPUCommandProcessor::readPreload(AMDKernelCode *akc, HSAQueueEntry *task)
726{
727 _hsa_dispatch_packet_t *disp_pkt =
728 (_hsa_dispatch_packet_t *)task->dispPktPtr();
729
730 // Data preloaded is copied from the kernarg segment. Preloading starts at
731 // the dword offset specified by kernarg_preload_spec_offset.
732 Addr preload_addr = (Addr)disp_pkt->kernarg_address
734
735 DPRINTF(GPUCommandProc, "Kernarg preload starts at addr: %#x\n",
736 preload_addr);
737
744 bool is_system_page = true;
745 Addr phys_addr = preload_addr;
746
753 int vmid = 1;
754 unsigned tmp_bytes;
755 walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
756 phys_addr, tmp_bytes, BaseMMU::Mode::Read,
757 is_system_page);
758
759 DPRINTF(GPUCommandProc, "Kernarg preload data is in %s memory\n",
760 is_system_page ? "host" : "device");
761
766 if (is_system_page) {
767 // Unclear if this is even possible as the point of kernarg preload
768 // is to avoid loads from host memory by explicitly placing them in
769 // device memory. It is not difficult to implement so issue a warning
770 // for now to indicate a possible place to debug if something goes
771 // wrong and this warning is seen.
772 warn("Preload kernarg from host untested!\n");
773
774 auto cb = new DmaVirtCallback<uint32_t>(
775 [ = ] (const uint32_t&) {
776 initPreload(akc, task);
777 });
778
779 dmaReadVirt(preload_addr,
780 sizeof(uint32_t) * akc->kernarg_preload_spec_length,
781 cb, task->preloadArgs());
782 } else {
783 // Read from GPU memory manager one cache line at a time to prevent
784 // rare cases where the preload data spans two memory pages.
785 constexpr unsigned alignment_granularity = 64;
786 ChunkGenerator gen(preload_addr,
787 sizeof(uint32_t) * akc->kernarg_preload_spec_length,
788 alignment_granularity);
789
790 for (; !gen.done(); gen.next()) {
791 Addr chunk_addr = gen.addr();
792 int vmid = 1;
793 unsigned dummy;
794 walker->startFunctional(
795 gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
796 dummy, BaseMMU::Mode::Read, is_system_page);
797
798 Request::Flags flags = Request::PHYSICAL;
799 RequestPtr request = std::make_shared<Request>(chunk_addr,
800 alignment_granularity, flags,
801 walker->getDevRequestor());
802
803 PacketPtr readPkt = new Packet(request, MemCmd::ReadReq);
804 readPkt->dataStatic((uint8_t *)task->preloadArgs()
805 + gen.complete());
806
807 struct KernelDispatchData dispatchData;
808 dispatchData.akc = akc;
809 dispatchData.task = task;
810 dispatchData.readPkt = readPkt;
811 kernelDispatchList.push_back(dispatchData);
812 performTimingRead(readPkt,
813 ComputeUnit::SQCPort::SenderState::DISPATCH_PRELOAD_ARG);
814 }
815 }
816}
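// As with the kernel object fetch in submitDispatchPkt(), the preload data
// is brought in either by one virtual DMA read (host memory, untested per
// the warning above) or by 64-byte timing reads through the SQC port when
// it resides in device memory. In both cases the bytes land in
// task->preloadArgs(), and initPreload() runs once the reads complete.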
817
818void
819GPUCommandProcessor::initPreload(AMDKernelCode *akc, HSAQueueEntry *task)
820{
821 // Fill in SGPRs
822 int num_sgprs = akc->kernarg_preload_spec_length;
823
824 task->preloadLength(num_sgprs);
825 for (int i = 0; i < num_sgprs; ++i) {
826 DPRINTF(GPUCommandProc, "Task preload user SGPR[%d] = %x\n",
827 i, task->preloadArgs()[i]);
828 }
829
830 delete akc;
831
832 initABI(task);
833}
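// The preloaded kernarg dwords are recorded on the task (preloadLength()
// and preloadArgs()) and are later copied into user SGPRs when wavefronts
// are initialized; here the CP only logs them, frees its copy of the AKC,
// and falls through to the normal initABI() path.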
834
841void
842GPUCommandProcessor::initABI(HSAQueueEntry *task)
843{
844 auto cb = new DmaVirtCallback<uint32_t>(
845 [ = ] (const uint32_t &readDispIdOffset)
846 { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);
847
848 Addr hostReadIdxPtr
849 = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;
850
851 dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
852 sizeof(uint32_t), cb, &cb->dmaBuffer);
853}
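// initABI() begins ABI setup by DMA-reading the 32-bit dispatch-id offset
// that is stored immediately after the queue's hostReadIndexPtr (hence the
// "+ sizeof(hostReadIdxPtr)"); the ReadDispIdOffsetDmaEvent callback
// continues filling in the task state once that value arrives.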
854
855void
856GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
857{
858 DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
859 akc->group_segment_fixed_size);
860 DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
861 akc->private_segment_fixed_size);
862 DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size);
863 DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
864 akc->kernel_code_entry_byte_offset);
865 DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset);
866 DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split);
867 DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
868 akc->granulated_workitem_vgpr_count);
869 DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
870 akc->granulated_wavefront_sgpr_count);
871 DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority);
872 DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32);
873 DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
874 akc->float_mode_round_16_64);
875 DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
876 akc->float_mode_denorm_32);
877 DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
878 akc->float_mode_denorm_16_64);
879 DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv);
880 DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp);
881 DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode);
882 DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode);
883 DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky);
884 DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user);
885 DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl);
886 DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode);
887 DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered);
888 DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress);
889 DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
890 akc->enable_private_segment);
891 DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count);
892 DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler);
893 DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
894 akc->enable_sgpr_workgroup_id_x);
895 DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
896 akc->enable_sgpr_workgroup_id_y);
897 DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
898 akc->enable_sgpr_workgroup_id_z);
899 DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
900 akc->enable_sgpr_workgroup_info);
901 DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
902 akc->enable_vgpr_workitem_id);
903 DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
904 akc->enable_exception_address_watch);
905 DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
906 akc->enable_exception_memory);
907 DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size);
908 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
909 akc->enable_exception_ieee_754_fp_invalid_operation);
910 DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
911 akc->enable_exception_fp_denormal_source);
912 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
913 akc->enable_exception_ieee_754_fp_division_by_zero);
914 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
915 akc->enable_exception_ieee_754_fp_overflow);
916 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
917 akc->enable_exception_ieee_754_fp_underflow);
918 DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
919 akc->enable_exception_ieee_754_fp_inexact);
920 DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
921 akc->enable_exception_int_divide_by_zero);
922 DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
923 akc->enable_sgpr_private_segment_buffer);
924 DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
925 akc->enable_sgpr_dispatch_ptr);
926 DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
927 akc->enable_sgpr_queue_ptr);
928 DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
929 akc->enable_sgpr_kernarg_segment_ptr);
930 DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
931 akc->enable_sgpr_dispatch_id);
932 DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
933 akc->enable_sgpr_flat_scratch_init);
934 DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
935 akc->enable_sgpr_private_segment_size);
936 DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
937 akc->enable_wavefront_size32);
938 DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack);
939 DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
940 akc->kernarg_preload_spec_length);
941 DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
942 akc->kernarg_preload_spec_offset);
943
944
945 // Check for features not implemented in gem5
946 fatal_if(akc->wgp_mode, "WGP mode not supported\n");
947 fatal_if(akc->mem_ordered, "Memory ordering control not supported\n");
948 fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n");
949
950
951 // Warn on features that gem5 will ignore
952 warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n");
953 warn_if(akc->bulky, "Bulky code object bit ignored\n");
954 // TODO: All the IEEE bits
955
956 warn_if(akc->tg_split, "TG split not implemented\n");
957}
958
959System*
960GPUCommandProcessor::system()
961{
962 return sys;
963}
964
965AddrRangeList
966GPUCommandProcessor::getAddrRanges() const
967{
968 AddrRangeList ranges;
969 return ranges;
970}
971
972void
973GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device)
974{
975 gpuDevice = gpu_device;
976 walker->setDevRequestor(gpuDevice->vramRequestorId());
977}
978
979void
980GPUCommandProcessor::setShader(Shader *shader)
981{
982 _shader = shader;
983}
984
985Shader*
986GPUCommandProcessor::shader()
987{
988 return _shader;
989}
990
991GfxVersion
992GPUCommandProcessor::getGfxVersion() const
993{
994 return FullSystem ? gpuDevice->getGfxVersion() : _driver->getGfxVersion();
995}
996
997} // namespace gem5