gpu_command_processor.cc
/*
 * Copyright (c) 2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_command_processor.hh"

#include <cassert>

#include "arch/amdgpu/vega/pagetable_walker.hh"
#include "base/chunk_generator.hh"
#include "base/trace.hh"
#include "debug/GPUCommandProc.hh"
#include "debug/GPUDisp.hh"
#include "debug/GPUInitAbi.hh"
#include "debug/GPUKernelInfo.hh"
#include "dev/amdgpu/amdgpu_device.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "gpu-compute/dispatcher.hh"
#include "gpu-compute/shader.hh"
#include "mem/abstract_mem.hh"
#include "mem/packet_access.hh"
#include "mem/se_translating_port_proxy.hh"
#include "mem/translating_port_proxy.hh"
#include "params/GPUCommandProcessor.hh"
#include "sim/full_system.hh"
#include "sim/process.hh"
#include "sim/proxy_ptr.hh"
#include "sim/sim_exit.hh"
#include "sim/syscall_emul_buf.hh"

namespace gem5
{

GPUCommandProcessor::GPUCommandProcessor(const Params &p)
    : DmaVirtDevice(p), dispatcher(*p.dispatcher), _driver(nullptr),
      walker(p.walker), hsaPP(p.hsapp),
      target_non_blit_kernel_id(p.target_non_blit_kernel_id)
{
    assert(hsaPP);
    hsaPP->setDevice(this);
    dispatcher.setCommandProcessor(this);
}

HSAPacketProcessor&
GPUCommandProcessor::hsaPacketProc()
{
    return *hsaPP;
}

/**
 * Function used to translate a range of addresses from virtual to physical
 * addresses.
 */
TranslationGenPtr
GPUCommandProcessor::translate(Addr vaddr, Addr size)
{
    if (!FullSystem) {
        // Grab the process and try to translate the virtual address with it;
        // with new extensions, it will likely be wrong to just arbitrarily
        // grab context zero.
        auto process = sys->threads[0]->getProcessPtr();

        return process->pTable->translateRange(vaddr, size);
    }

    // In full system use the page tables setup by the kernel driver rather
    // than the CPU page tables.
    return TranslationGenPtr(
        new AMDGPUVM::UserTranslationGen(&gpuDevice->getVM(), walker,
                                         1 /* vmid */, vaddr, size));
}

/**
 * submitDispatchPkt() is the entry point into the CP from the HSAPP and is
 * only meant to be used with AQL kernel dispatch packets. After the HSAPP
 * receives and extracts an AQL packet, it sends it to the CP, which is
 * responsible for gathering all relevant information about a task and
 * initializing the ABI state before the task is sent to the dispatcher.
 */
void
GPUCommandProcessor::submitDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                       Addr host_pkt_addr)
{
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    // The kernel object should be aligned to a 64B boundary, but not
    // necessarily a cache line boundary.
    unsigned akc_alignment_granularity = 64;
    assert(!(disp_pkt->kernel_object & (akc_alignment_granularity - 1)));

    // If any L2 cache invalidates are still outstanding, defer this launch
    // to the shader, which resubmits it once the invalidates complete.
    if (shader()->getNumOutstandingInvL2s() > 0) {
        DPRINTF(GPUCommandProc,
                "Deferring kernel launch due to outstanding L2 invalidates\n");
        shader()->addDeferredDispatch(raw_pkt, queue_id, host_pkt_addr);

        return;
    }

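    // The AMDKernelCode (AKC) descriptor pointed to by kernel_object holds
    // the kernel's register usage, LDS size, and machine code entry point
    // offset. It must be read from memory before the task can be built and
    // dispatched; the read path depends on the simulation mode (see below).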
    // Note: the raw akc pointer is deleted in dispatchKernelObject() once
    // the dispatch has been handled.
    AMDKernelCode *akc = new AMDKernelCode;

    // In SE mode, read the AKC functionally through the SE translating
    // proxy. In FS mode the object may live in system or device memory,
    // so it is read with a timing DMA or directly from device memory.
    if (!FullSystem) {
        auto *tc = sys->threads[0];
        SETranslatingPortProxy virt_proxy(tc);

        DPRINTF(GPUCommandProc, "reading kernel_object using proxy\n");
        virt_proxy.readBlob(disp_pkt->kernel_object, (uint8_t*)akc,
                            sizeof(AMDKernelCode));

        dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
    } else {
        // In full system the kernel object may reside in a system page or a
        // device page. Walk the GPU page table to find out which.
        bool is_system_page = true;
        Addr phys_addr = disp_pkt->kernel_object;

        // Full system currently only supports a single VMID (one virtual
        // memory space, i.e., one application on the GPU at a time).
        int vmid = 1;
        unsigned tmp_bytes;
        walker->startFunctional(gpuDevice->getVM().getPageTableBase(vmid),
                                phys_addr, tmp_bytes, BaseMMU::Mode::Read,
                                is_system_page);

        DPRINTF(GPUCommandProc, "kernel_object vaddr %#lx paddr %#lx size %d"
                " s:%d\n", disp_pkt->kernel_object, phys_addr,
                sizeof(AMDKernelCode), is_system_page);

        // If the kernel object is in system memory, read it with a virtual
        // DMA; otherwise read it from device memory below.
        if (is_system_page) {
            DPRINTF(GPUCommandProc,
                    "sending system DMA read for kernel_object\n");

            auto dma_callback = new DmaVirtCallback<uint32_t>(
                [=](const uint32_t&) {
                    dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
                });

            dmaReadVirt(disp_pkt->kernel_object, sizeof(AMDKernelCode),
                        dma_callback, (void *)akc);
        } else {
            DPRINTF(GPUCommandProc,
                    "kernel_object in device, using device mem\n");

            // Read from GPU memory manager one cache line at a time to prevent
            // rare cases where the AKC spans two memory pages.
            ChunkGenerator gen(disp_pkt->kernel_object, sizeof(AMDKernelCode),
                               akc_alignment_granularity);
            for (; !gen.done(); gen.next()) {
                Addr chunk_addr = gen.addr();
                int vmid = 1;
                unsigned dummy;
                walker->startFunctional(
                    gpuDevice->getVM().getPageTableBase(vmid), chunk_addr,
                    dummy, BaseMMU::Mode::Read, is_system_page);

                Request::Flags flags = Request::PHYSICAL;
                RequestPtr request = std::make_shared<Request>(chunk_addr,
                    akc_alignment_granularity, flags,
                    walker->getDevRequestor());
                Packet *readPkt = new Packet(request, MemCmd::ReadReq);
                readPkt->dataStatic((uint8_t *)akc + gen.complete());
                // If the request spans two device memories, the device memory
                // returned will be null.
                assert(system()->getDeviceMemory(readPkt) != nullptr);
                system()->getDeviceMemory(readPkt)->access(readPkt);
                delete readPkt;
            }

            dispatchKernelObject(akc, raw_pkt, queue_id, host_pkt_addr);
        }
    }
}

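// dispatchKernelObject() runs once the AKC has been read (functionally in SE
// mode, or via DMA / device memory access in FS mode). It sanity checks the
// AKC, builds the HSAQueueEntry for the task, and hands it to initABI().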
void
GPUCommandProcessor::dispatchKernelObject(AMDKernelCode *akc, void *raw_pkt,
                                          uint32_t queue_id, Addr host_pkt_addr)
{
    _hsa_dispatch_packet_t *disp_pkt = (_hsa_dispatch_packet_t*)raw_pkt;

    sanityCheckAKC(akc);

    DPRINTF(GPUCommandProc, "GPU machine code is %lli bytes from start of the "
            "kernel object\n", akc->kernel_code_entry_byte_offset);

    Addr machine_code_addr = (Addr)disp_pkt->kernel_object
        + akc->kernel_code_entry_byte_offset;

    DPRINTF(GPUCommandProc, "Machine code starts at addr: %#x\n",
            machine_code_addr);

    std::string kernel_name;

    // The CP does not have access to the kernel's symbol name, so a generic
    // placeholder name is used. Dispatch packets without a completion signal
    // are treated as application (non-blit) kernels, while packets carrying
    // a completion signal are treated as blit kernels.
    bool is_blit_kernel;
    if (!disp_pkt->completion_signal) {
        kernel_name = "Some kernel";
        is_blit_kernel = false;
    } else {
        kernel_name = "Blit kernel";
        is_blit_kernel = true;
    }

    DPRINTF(GPUKernelInfo, "Kernel name: %s\n", kernel_name.c_str());

    GfxVersion gfxVersion = FullSystem ? gpuDevice->getGfxVersion()
                                       : driver()->getGfxVersion();
    HSAQueueEntry *task = new HSAQueueEntry(kernel_name, queue_id,
        dynamic_task_id, raw_pkt, akc, host_pkt_addr, machine_code_addr,
        gfxVersion);

297
298 // The driver expects the start time to be in ns
299 Tick start_ts = curTick() / sim_clock::as_int::ns;
300 dispatchStartTime.insert({disp_pkt->completion_signal, start_ts});
301
302 // Potentially skip a non-blit kernel
303 if (!is_blit_kernel && (non_blit_kernel_id < target_non_blit_kernel_id)) {
304 DPRINTF(GPUCommandProc, "Skipping non-blit kernel %i (Task ID: %i)\n",
306
307 // Notify the HSA PP that this kernel is complete
308 hsaPacketProc().finishPkt(task->dispPktPtr(), task->queueId());
309 if (task->completionSignal()) {
310 DPRINTF(GPUDisp, "HSA AQL Kernel Complete with completion "
311 "signal! Addr: %d\n", task->completionSignal());
312
314 } else {
315 DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
316 "signal\n");
317 }
318
321
322 delete akc;
323
324 // Notify the run script that a kernel has been skipped
325 exitSimLoop("Skipping GPU Kernel");
326
327 return;
328 }
329
330 DPRINTF(GPUCommandProc, "Task ID: %i Got AQL: wg size (%dx%dx%d), "
331 "grid size (%dx%dx%d) kernarg addr: %#x, completion "
332 "signal addr:%#x\n", dynamic_task_id, disp_pkt->workgroup_size_x,
333 disp_pkt->workgroup_size_y, disp_pkt->workgroup_size_z,
334 disp_pkt->grid_size_x, disp_pkt->grid_size_y,
335 disp_pkt->grid_size_z, disp_pkt->kernarg_address,
336 disp_pkt->completion_signal);
337
338 DPRINTF(GPUCommandProc, "Extracted code object: %s (num vector regs: %d, "
339 "num scalar regs: %d, code addr: %#x, kernarg size: %d, "
340 "LDS size: %d)\n", kernel_name, task->numVectorRegs(),
341 task->numScalarRegs(), task->codeAddr(), 0, 0);
342
343 initABI(task);
345 if (!is_blit_kernel) ++non_blit_kernel_id;
346
347 delete akc;
348}
349
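// Completion signals: when a dispatch packet carries a completion signal,
// the CP decrements the amd_signal_t value by one once the kernel finishes.
// In SE mode this is a functional read plus a single DMA write; in FS mode
// the entire update is performed as a chain of timing DMAs (see the
// updateHsaSignal* methods below).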
void
GPUCommandProcessor::sendCompletionSignal(Addr signal_handle)
{
    // Originally the completion signal was read functionally and written
    // with a timing DMA. This can cause issues in FullSystem mode and
    // cause translation failures. Therefore, in FullSystem mode everything
    // is done in timing mode.

    if (!FullSystem) {
        // The semantics of the HSA signal are to decrement the current
        // signal value by one. Read the current value functionally and
        // DMA the decremented value back to memory.
        uint64_t signal_value = functionalReadHsaSignal(signal_handle);

        updateHsaSignal(signal_handle, signal_value - 1);
    } else {
        // The semantics of the HSA signal is to decrement the current
        // signal value by one. Do this asynchronously via DMAs and
        // callbacks as we can safely continue with this function
        // while waiting for the next packet from the host.
        updateHsaSignalAsync(signal_handle, -1);
    }
}

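// The asynchronous update proceeds as a chain of DMA callbacks:
//   1. read the signal's mailbox pointer (updateHsaSignalAsync)
//   2. if the mailbox is non-zero, read the event ID, write it back to
//      clear the event, and write the dispatch timestamps
//      (updateHsaMailboxData -> updateHsaEventData); otherwise just write
//      the timestamps (updateHsaEventTs)
//   3. read the current signal value, apply the diff, and write it back
//      (updateHsaSignalData -> updateHsaSignalDone)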
void
GPUCommandProcessor::updateHsaSignalAsync(Addr signal_handle, int64_t diff)
{
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
    uint64_t *mailboxValue = new uint64_t;
    auto cb2 = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
            { updateHsaMailboxData(signal_handle, mailboxValue); });
    dmaReadVirt(mailbox_addr, sizeof(uint64_t), cb2, (void *)mailboxValue);
    DPRINTF(GPUCommandProc, "updateHsaSignalAsync reading mailbox addr %lx\n",
            mailbox_addr);
}

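// A non-zero mailbox value marks an interruptible signal: the event ID is
// read and the driver is notified directly. A zero mailbox value is a
// default signal, for which only the dispatch timestamps and the signal
// value itself are updated.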
void
GPUCommandProcessor::updateHsaMailboxData(Addr signal_handle,
                                          uint64_t *mailbox_value)
{
    Addr event_addr = getHsaSignalEventAddr(signal_handle);

    DPRINTF(GPUCommandProc, "updateHsaMailboxData read %ld\n", *mailbox_value);
    if (*mailbox_value != 0) {
        // This is an interruptible signal. Now, read the
        // event ID and directly communicate with the driver
        // about that event notification.
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &)
                { updateHsaEventData(signal_handle, mailbox_value); });
        dmaReadVirt(event_addr, sizeof(uint64_t), cb, (void *)mailbox_value);
    } else {
        delete mailbox_value;

        Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);

        amd_event_t *event_ts = new amd_event_t;
        event_ts->start_ts = dispatchStartTime[signal_handle];
        event_ts->end_ts = curTick() / sim_clock::as_int::ns;
        auto cb = new DmaVirtCallback<uint64_t>(
            [ = ] (const uint64_t &)
                { updateHsaEventTs(signal_handle, event_ts); });
        dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb, (void *)event_ts);
        DPRINTF(GPUCommandProc, "updateHsaMailboxData writing timestamp addr "
                "%lx\n", ts_addr);

        dispatchStartTime.erase(signal_handle);
    }
}

void
GPUCommandProcessor::updateHsaEventData(Addr signal_handle,
                                        uint64_t *event_value)
{
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);

    DPRINTF(GPUCommandProc, "updateHsaEventData read %ld\n", *event_value);
    // Write *event_value to the mailbox to clear the event
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
            { updateHsaSignalDone(event_value); }, *event_value);
    dmaWriteVirt(mailbox_addr, sizeof(uint64_t), cb, &cb->dmaBuffer, 0);

    Addr ts_addr = signal_handle + offsetof(amd_signal_t, start_ts);

    amd_event_t *event_ts = new amd_event_t;
    event_ts->start_ts = dispatchStartTime[signal_handle];
    event_ts->end_ts = curTick() / sim_clock::as_int::ns;
    auto cb2 = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
            { updateHsaEventTs(signal_handle, event_ts); });
    dmaWriteVirt(ts_addr, sizeof(amd_event_t), cb2, (void *)event_ts);
    DPRINTF(GPUCommandProc, "updateHsaEventData writing timestamp addr %lx\n",
            ts_addr);

    dispatchStartTime.erase(signal_handle);
}

void
GPUCommandProcessor::updateHsaEventTs(Addr signal_handle,
                                      amd_event_t *ts)
{
    delete ts;

    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    int64_t diff = -1;

    uint64_t *signalValue = new uint64_t;
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
            { updateHsaSignalData(value_addr, diff, signalValue); });
    dmaReadVirt(value_addr, sizeof(uint64_t), cb, (void *)signalValue);
    DPRINTF(GPUCommandProc, "updateHsaEventTs reading value addr %lx\n",
            value_addr);
}

void
GPUCommandProcessor::updateHsaSignalData(Addr value_addr, int64_t diff,
                                         uint64_t *prev_value)
{
    // Reuse the value allocated for the read
    DPRINTF(GPUCommandProc, "updateHsaSignalData read %ld, writing %ld\n",
            *prev_value, *prev_value + diff);
    *prev_value += diff;
    auto cb = new DmaVirtCallback<uint64_t>(
        [ = ] (const uint64_t &)
            { updateHsaSignalDone(prev_value); });
    dmaWriteVirt(value_addr, sizeof(uint64_t), cb, (void *)prev_value);
}

void
GPUCommandProcessor::updateHsaSignalDone(uint64_t *signal_value)
{
    delete signal_value;
}

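// functionalReadHsaSignal() reads the current signal value with a functional
// (proxy) access. It is only used on the SE-mode path of
// sendCompletionSignal(); the FS-mode path uses timing DMAs instead.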
uint64_t
GPUCommandProcessor::functionalReadHsaSignal(Addr signal_handle)
{
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    auto tc = system()->threads[0];
    ConstVPtr<Addr> prev_value(value_addr, tc);
    return *prev_value;
}

void
GPUCommandProcessor::updateHsaSignal(Addr signal_handle, uint64_t signal_value,
                                     HsaSignalCallbackFunction function)
{
    // The signal value is aligned 8 bytes from
    // the actual handle in the runtime
    Addr value_addr = getHsaSignalValueAddr(signal_handle);
    Addr mailbox_addr = getHsaSignalMailboxAddr(signal_handle);
    Addr event_addr = getHsaSignalEventAddr(signal_handle);
    DPRINTF(GPUCommandProc, "Triggering completion signal: %x!\n", value_addr);

    auto cb = new DmaVirtCallback<uint64_t>(function, signal_value);

    dmaWriteVirt(value_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);

    auto tc = system()->threads[0];
    ConstVPtr<uint64_t> mailbox_ptr(mailbox_addr, tc);

    // Notifying an event with its mailbox pointer is
    // not supported in the current implementation. Just use
    // the mailbox pointer to distinguish between interruptible
    // and default signals. An interruptible signal will have
    // a valid mailbox pointer.
    if (*mailbox_ptr != 0) {
        // This is an interruptible signal. Now, read the
        // event ID and directly communicate with the driver
        // about that event notification.
        ConstVPtr<uint32_t> event_val(event_addr, tc);

        DPRINTF(GPUCommandProc, "Calling signal wakeup event on "
                "signal event value %d\n", *event_val);

        // The mailbox/wakeup signal uses the SE mode proxy port to write
        // the event value. This is not available in full system mode so
        // instead we need to issue a DMA write to the address. The value of
        // *event_val clears the event.
        if (FullSystem) {
            auto cb = new DmaVirtCallback<uint64_t>(function, *event_val);
            dmaWriteVirt(mailbox_addr, sizeof(Addr), cb, &cb->dmaBuffer, 0);
        } else {
            signalWakeupEvent(*event_val);
        }
    }
}

void
GPUCommandProcessor::attachDriver(GPUComputeDriver *gpu_driver)
{
    fatal_if(_driver, "Should not overwrite driver.");
    // TODO: GPU Driver inheritance hierarchy doesn't really make sense.
    // Should get rid of the base class.
    _driver = gpu_driver;
    assert(_driver);
}

GPUComputeDriver*
GPUCommandProcessor::driver()
{
    return _driver;
}

/**
 * Forward the VRAM requestor ID needed for device memory from GPU device.
 */
RequestorID
GPUCommandProcessor::vramRequestorId()
{
    return gpuDevice->vramRequestorId();
}

/**
 * submitVendorPkt() is for accepting vendor-specific packets from the HSAPP.
 * Vendor-specific packets are not supported by gem5: if the packet carries a
 * completion signal it is sent, and the packet is otherwise ignored.
 */
void
GPUCommandProcessor::submitVendorPkt(void *raw_pkt, uint32_t queue_id,
                                     Addr host_pkt_addr)
{
    auto vendor_pkt = (_hsa_generic_vendor_pkt *)raw_pkt;

    if (vendor_pkt->completion_signal) {
        sendCompletionSignal(vendor_pkt->completion_signal);
    }

    warn("Ignoring vendor packet\n");

    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * submitAgentDispatchPkt() is for accepting agent dispatch packets. These
 * are used here either as a NOP or to steal the completion signal handle of
 * a previously dispatched kernel and return it to the host.
 */
void
GPUCommandProcessor::submitAgentDispatchPkt(void *raw_pkt, uint32_t queue_id,
                                            Addr host_pkt_addr)
{
    // Parse the packet and see what it wants us to do
    _hsa_agent_dispatch_packet_t *agent_pkt =
        (_hsa_agent_dispatch_packet_t *)raw_pkt;

    if (agent_pkt->type == AgentCmd::Nop) {
        DPRINTF(GPUCommandProc, "Agent Dispatch Packet NOP\n");
    } else if (agent_pkt->type == AgentCmd::Steal) {
        // This is where we steal the HSA task's completion signal
        int kid = agent_pkt->arg[0];
        DPRINTF(GPUCommandProc,
                "Agent Dispatch Packet Stealing signal handle for kernel %d\n",
                kid);

        HSAQueueEntry *task = dispatcher.hsaTask(kid);
        uint64_t signal_addr = task->completionSignal(); // + sizeof(uint64_t);

        uint64_t return_address = agent_pkt->return_address;
        DPRINTF(GPUCommandProc, "Return Addr: %p\n", return_address);

        // DMA the signal handle into the packet's return address
        Addr *new_signal_addr = new Addr;
        *new_signal_addr = (Addr)signal_addr;
        dmaWriteVirt(return_address, sizeof(Addr), nullptr, new_signal_addr, 0);

        DPRINTF(GPUCommandProc,
                "Agent Dispatch Packet Stealing signal handle from kid %d :"
                "(%x:%x) writing into %x\n",
                kid, signal_addr, new_signal_addr, return_address);
    } else {
        panic("The agent dispatch packet provided an unknown argument in "
              "arg[0]; currently only 0 (nop) or 1 (return kernel signal) is "
              "accepted");
    }

    hsaPP->finishPkt(raw_pkt, queue_id);
}

/**
 * Once the CP has finished extracting all relevant information about a task
 * and has initialized the ABI state, it sends the task to the dispatcher,
 * which is responsible for WG creation and dispatch.
 */
void
GPUCommandProcessor::dispatchPkt(HSAQueueEntry *task)
{
    dispatcher.dispatch(task);
}

void
GPUCommandProcessor::signalWakeupEvent(uint32_t event_id)
{
    _driver->signalWakeupEvent(event_id);
}

/**
 * The CP is responsible for traversing all HSA-ABI-related data structures
 * from memory and initializing the ABI state before a task is dispatched.
 * initABI() starts this process by reading the dispatch ID offset, which
 * follows directly after the read index pointer in the HSA queue descriptor.
 */
void
GPUCommandProcessor::initABI(HSAQueueEntry *task)
{
    auto cb = new DmaVirtCallback<uint32_t>(
        [ = ] (const uint32_t &readDispIdOffset)
            { ReadDispIdOffsetDmaEvent(task, readDispIdOffset); }, 0);

    Addr hostReadIdxPtr
        = hsaPP->getQueueDesc(task->queueId())->hostReadIndexPtr;

    dmaReadVirt(hostReadIdxPtr + sizeof(hostReadIdxPtr),
                sizeof(uint32_t), cb, &cb->dmaBuffer);
}

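// sanityCheckAKC() dumps the AMD kernel code descriptor at the GPUInitAbi
// debug level and then fatals or warns on AKC features that gem5 does not
// implement.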
void
GPUCommandProcessor::sanityCheckAKC(AMDKernelCode *akc)
{
    DPRINTF(GPUInitAbi, "group_segment_fixed_size: %d\n",
            akc->group_segment_fixed_size);
    DPRINTF(GPUInitAbi, "private_segment_fixed_size: %d\n",
            akc->private_segment_fixed_size);
    DPRINTF(GPUInitAbi, "kernarg_size: %d\n", akc->kernarg_size);
    DPRINTF(GPUInitAbi, "kernel_code_entry_byte_offset: %d\n",
            akc->kernel_code_entry_byte_offset);
    DPRINTF(GPUInitAbi, "accum_offset: %d\n", akc->accum_offset);
    DPRINTF(GPUInitAbi, "tg_split: %d\n", akc->tg_split);
    DPRINTF(GPUInitAbi, "granulated_workitem_vgpr_count: %d\n",
            akc->granulated_workitem_vgpr_count);
    DPRINTF(GPUInitAbi, "granulated_wavefront_sgpr_count: %d\n",
            akc->granulated_wavefront_sgpr_count);
    DPRINTF(GPUInitAbi, "priority: %d\n", akc->priority);
    DPRINTF(GPUInitAbi, "float_mode_round_32: %d\n", akc->float_mode_round_32);
    DPRINTF(GPUInitAbi, "float_mode_round_16_64: %d\n",
            akc->float_mode_round_16_64);
    DPRINTF(GPUInitAbi, "float_mode_denorm_32: %d\n",
            akc->float_mode_denorm_32);
    DPRINTF(GPUInitAbi, "float_mode_denorm_16_64: %d\n",
            akc->float_mode_denorm_16_64);
    DPRINTF(GPUInitAbi, "priv: %d\n", akc->priv);
    DPRINTF(GPUInitAbi, "enable_dx10_clamp: %d\n", akc->enable_dx10_clamp);
    DPRINTF(GPUInitAbi, "debug_mode: %d\n", akc->debug_mode);
    DPRINTF(GPUInitAbi, "enable_ieee_mode: %d\n", akc->enable_ieee_mode);
    DPRINTF(GPUInitAbi, "bulky: %d\n", akc->bulky);
    DPRINTF(GPUInitAbi, "cdbg_user: %d\n", akc->cdbg_user);
    DPRINTF(GPUInitAbi, "fp16_ovfl: %d\n", akc->fp16_ovfl);
    DPRINTF(GPUInitAbi, "wgp_mode: %d\n", akc->wgp_mode);
    DPRINTF(GPUInitAbi, "mem_ordered: %d\n", akc->mem_ordered);
    DPRINTF(GPUInitAbi, "fwd_progress: %d\n", akc->fwd_progress);
    DPRINTF(GPUInitAbi, "enable_private_segment: %d\n",
            akc->enable_private_segment);
    DPRINTF(GPUInitAbi, "user_sgpr_count: %d\n", akc->user_sgpr_count);
    DPRINTF(GPUInitAbi, "enable_trap_handler: %d\n", akc->enable_trap_handler);
    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_x: %d\n",
            akc->enable_sgpr_workgroup_id_x);
    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_y: %d\n",
            akc->enable_sgpr_workgroup_id_y);
    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_id_z: %d\n",
            akc->enable_sgpr_workgroup_id_z);
    DPRINTF(GPUInitAbi, "enable_sgpr_workgroup_info: %d\n",
            akc->enable_sgpr_workgroup_info);
    DPRINTF(GPUInitAbi, "enable_vgpr_workitem_id: %d\n",
            akc->enable_vgpr_workitem_id);
    DPRINTF(GPUInitAbi, "enable_exception_address_watch: %d\n",
            akc->enable_exception_address_watch);
    DPRINTF(GPUInitAbi, "enable_exception_memory: %d\n",
            akc->enable_exception_memory);
    DPRINTF(GPUInitAbi, "granulated_lds_size: %d\n", akc->granulated_lds_size);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_invalid_operation: %d\n",
            akc->enable_exception_ieee_754_fp_invalid_operation);
    DPRINTF(GPUInitAbi, "enable_exception_fp_denormal_source: %d\n",
            akc->enable_exception_fp_denormal_source);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_division_by_zero: %d\n",
            akc->enable_exception_ieee_754_fp_division_by_zero);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_overflow: %d\n",
            akc->enable_exception_ieee_754_fp_overflow);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_underflow: %d\n",
            akc->enable_exception_ieee_754_fp_underflow);
    DPRINTF(GPUInitAbi, "enable_exception_ieee_754_fp_inexact: %d\n",
            akc->enable_exception_ieee_754_fp_inexact);
    DPRINTF(GPUInitAbi, "enable_exception_int_divide_by_zero: %d\n",
            akc->enable_exception_int_divide_by_zero);
    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_buffer: %d\n",
            akc->enable_sgpr_private_segment_buffer);
    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_ptr: %d\n",
            akc->enable_sgpr_dispatch_ptr);
    DPRINTF(GPUInitAbi, "enable_sgpr_queue_ptr: %d\n",
            akc->enable_sgpr_queue_ptr);
    DPRINTF(GPUInitAbi, "enable_sgpr_kernarg_segment_ptr: %d\n",
            akc->enable_sgpr_kernarg_segment_ptr);
    DPRINTF(GPUInitAbi, "enable_sgpr_dispatch_id: %d\n",
            akc->enable_sgpr_dispatch_id);
    DPRINTF(GPUInitAbi, "enable_sgpr_flat_scratch_init: %d\n",
            akc->enable_sgpr_flat_scratch_init);
    DPRINTF(GPUInitAbi, "enable_sgpr_private_segment_size: %d\n",
            akc->enable_sgpr_private_segment_size);
    DPRINTF(GPUInitAbi, "enable_wavefront_size32: %d\n",
            akc->enable_wavefront_size32);
    DPRINTF(GPUInitAbi, "use_dynamic_stack: %d\n", akc->use_dynamic_stack);
    DPRINTF(GPUInitAbi, "kernarg_preload_spec_length: %d\n",
            akc->kernarg_preload_spec_length);
    DPRINTF(GPUInitAbi, "kernarg_preload_spec_offset: %d\n",
            akc->kernarg_preload_spec_offset);

    // Check for features not implemented in gem5
    fatal_if(akc->wgp_mode, "WGP mode not supported\n");
    fatal_if(akc->mem_ordered, "Memory ordering control not supported\n");
    fatal_if(akc->fwd_progress, "Fwd_progress mode not supported\n");

    // Warn on features that gem5 will ignore
    warn_if(akc->fp16_ovfl, "FP16 clamp control bit ignored\n");
    warn_if(akc->bulky, "Bulky code object bit ignored\n");
    // TODO: All the IEEE bits

    warn_if(akc->kernarg_preload_spec_length,
            "Kernarg preload not implemented\n");
    warn_if(akc->tg_split, "TG split not implemented\n");
}

System*
GPUCommandProcessor::system()
{
    return sys;
}

AddrRangeList
GPUCommandProcessor::getAddrRanges() const
{
    AddrRangeList ranges;
    return ranges;
}

void
GPUCommandProcessor::setGPUDevice(AMDGPUDevice *gpu_device)
{
    gpuDevice = gpu_device;
    walker->setDevRequestor(gpuDevice->vramRequestorId());
}

void
GPUCommandProcessor::setShader(Shader *shader)
{
    _shader = shader;
}

Shader*
GPUCommandProcessor::shader()
{
    return _shader;
}

} // namespace gem5