41#include "debug/GPUDriver.hh"
42#include "debug/GPUShader.hh"
51#include "params/GPUComputeDriver.hh"
62 isdGPU(
p.isdGPU), gfxVersion(
p.gfxVersion), dGPUPoolID(
p.dGPUPoolID),
63 eventPage(0), eventSlotIndex(0)
66 DPRINTF(GPUDriver,
"Constructing KFD: device\n");
70 std::bitset<MtypeFlags::NUM_MTYPE_BITS> mtype(
p.m_type);
87 return "DriverWakeupEvent";
98 auto device_fd_entry = std::make_shared<DeviceFDEntry>(
this,
filename);
99 int tgt_fd = process->fds->allocFD(device_fd_entry);
109 int prot,
int tgt_flags,
int tgt_fd, off_t
offset)
116 DPRINTF(GPUDriver,
"amdkfd mmap (start: %p, length: 0x%x,"
117 "offset: 0x%x)\n", start, length,
offset);
121 DPRINTF(GPUDriver,
"amdkfd mmap type DOORBELL offset\n");
122 start = mem_state->extendMmap(length);
127 DPRINTF(GPUDriver,
"amdkfd mmap type EVENTS offset\n");
129 "Start address should be provided by KFD\n");
131 "Requested length %d, expected length %d; length "
139 eventPage = mem_state->extendMmap(length);
144 warn_once(
"Unrecognized kfd mmap type %llx\n", mmap_type);
165 fatal(
"%s: Exceeded maximum number of HSA queues allowed\n",
name());
178 args->ring_base_address, args->queue_id,
187 driver->schedule(
this,
curTick() + wakeup_delay);
194 "Trying wakeup on an event that is not yet created\n");
195 if (
ETable[event_id].threadWaiting) {
197 "No thread context to wake up\n");
200 "Signal event: Waking up CPU %d\n", tc->cpuId());
213 ETable[event_id].setEvent =
true;
221 "Timer event: Waking up CPU %d\n", tc->cpuId());
223 driver->TCEvents[tc].clearEvents();
240 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_GET_VERSION\n");
251 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_CREATE_QUEUE\n");
262 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \
263 "queue offset %d\n", args->queue_id);
283 warn(
"unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
288 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");
294 args->system_clock_freq = 1000000000;
301 args->gpu_clock_counter = elapsed_nsec;
302 args->cpu_clock_counter = elapsed_nsec;
303 args->system_clock_counter = elapsed_nsec;
310 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");
313 args->num_of_nodes = 1;
321 for (
int i = 0;
i < args->num_of_nodes; ++
i) {
330 case GfxVersion::gfx900:
331 case GfxVersion::gfx902:
332 args->process_apertures[
i].scratch_base =
334 args->process_apertures[
i].lds_base =
338 fatal(
"Invalid gfx version\n");
341 args->process_apertures[
i].scratch_limit =
344 args->process_apertures[
i].lds_limit =
348 case GfxVersion::gfx900:
349 case GfxVersion::gfx902:
351 args->process_apertures[
i].gpuvm_base = 0x1000000ull;
353 args->process_apertures[
i].gpuvm_limit =
354 0x0000800000000000ULL - 1;
357 fatal(
"Invalid gfx version");
371 case GfxVersion::gfx900:
372 args->process_apertures[
i].gpu_id = 22124;
375 fatal(
"Invalid gfx version for dGPU\n");
379 case GfxVersion::gfx902:
380 args->process_apertures[
i].gpu_id = 2765;
383 fatal(
"Invalid gfx version for APU\n");
387 DPRINTF(GPUDriver,
"GPUVM base for node[%i] = %#x\n",
i,
388 args->process_apertures[
i].gpuvm_base);
389 DPRINTF(GPUDriver,
"GPUVM limit for node[%i] = %#x\n",
i,
390 args->process_apertures[
i].gpuvm_limit);
392 DPRINTF(GPUDriver,
"LDS base for node[%i] = %#x\n",
i,
393 args->process_apertures[
i].lds_base);
394 DPRINTF(GPUDriver,
"LDS limit for node[%i] = %#x\n",
i,
395 args->process_apertures[
i].lds_limit);
397 DPRINTF(GPUDriver,
"Scratch base for node[%i] = %#x\n",
i,
398 args->process_apertures[
i].scratch_base);
399 DPRINTF(GPUDriver,
"Scratch limit for node[%i] = %#x\n",
i,
400 args->process_apertures[
i].scratch_limit);
408 assert(
bits<Addr>(args->process_apertures[
i].scratch_base, 63,
410 assert(
bits<Addr>(args->process_apertures[
i].scratch_base, 63,
412 assert(
bits<Addr>(args->process_apertures[
i].scratch_limit, 63,
414 assert(
bits<Addr>(args->process_apertures[
i].scratch_limit, 63,
416 assert(
bits<Addr>(args->process_apertures[
i].lds_base, 63,
418 assert(
bits<Addr>(args->process_apertures[
i].lds_base, 63,
420 assert(
bits<Addr>(args->process_apertures[
i].lds_limit, 63,
422 assert(
bits<Addr>(args->process_apertures[
i].lds_limit, 63,
431 warn(
"unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
436 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_CREATE_EVENT\n");
441 warn(
"Signal events are only supported currently\n");
443 fatal(
"Signal event wasn't created; signal limit reached\n");
447 uint64_t page_index = 0;
454 args->event_trigger_data = args->event_id;
455 DPRINTF(GPUDriver,
"amdkfd create events"
456 "(event_id: 0x%x, offset: 0x%x)\n",
457 args->event_id, args->event_page_offset);
470 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
473 DPRINTF(GPUDriver,
"amdkfd destroying event %d\n", args->event_id);
475 "Event ID invalid, cannot destroy this event\n");
476 ETable.erase(args->event_id);
481 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_SET_EVENTS\n");
484 DPRINTF(GPUDriver,
"amdkfd set event %d\n", args->event_id);
486 "Event ID invlaid, cannot set this event\n");
487 ETable[args->event_id].setEvent =
true;
493 warn(
"unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
498 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
503 DPRINTF(GPUDriver,
"amdkfd wait for events"
504 "(wait on all: %d, timeout : %d, num_events: %s)\n",
505 args->wait_for_all, args->timeout, args->num_events);
506 panic_if(args->wait_for_all != 0 && args->num_events > 1,
507 "Wait for all events not supported\n");
508 bool should_sleep =
true;
512 TCEvents.emplace(std::piecewise_construct, std::make_tuple(tc),
513 std::make_tuple(
this, tc));
514 DPRINTF(GPUDriver,
"\tamdkfd creating event list"
515 " for thread %d\n", tc->
cpuId());
518 "There are %d events that put this thread to sleep,"
519 " this thread should not be running\n",
521 for (
int i = 0;
i < args->num_events;
i++) {
523 "Event pointer invalid\n");
527 EventData.
copyIn(virt_proxy);
529 "\tamdkfd wait for event %d\n", EventData->event_id);
531 "Event ID invalid, cannot set this event\n");
532 if (
ETable[EventData->event_id].threadWaiting)
533 warn(
"Multiple threads waiting on the same event\n");
534 if (
ETable[EventData->event_id].setEvent) {
537 ETable[EventData->event_id].setEvent =
false;
538 should_sleep =
false;
542 ETable[EventData->event_id].threadWaiting =
true;
543 ETable[EventData->event_id].tc = tc;
544 TCEvents[tc].signalEvents.insert(EventData->event_id);
552 args->wait_result = 0;
565 warn(
"unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
570 warn(
"unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
575 warn(
"unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
580 warn(
"unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
585 warn(
"unimplemented ioctl: AMDKFD_IOC_SET_SCRATCH_BACKING_VA\n");
590 warn(
"unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
595 warn(
"unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
601 "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");
606 ioc_args.
copyIn(virt_proxy);
607 ioc_args->num_of_nodes = 1;
609 for (
int i = 0;
i < ioc_args->num_of_nodes; ++
i) {
611 (ioc_args->kfd_process_device_apertures_ptr);
614 case GfxVersion::gfx900:
615 case GfxVersion::gfx902:
620 fatal(
"Invalid gfx version\n");
623 ape_args->scratch_limit =
625 ape_args->lds_limit =
ldsApeLimit(ape_args->lds_base);
628 case GfxVersion::gfx900:
629 case GfxVersion::gfx902:
631 ape_args->gpuvm_base = 0x1000000ull;
633 ape_args->gpuvm_limit = 0x0000800000000000ULL - 1;
636 fatal(
"Invalid gfx version\n");
642 case GfxVersion::gfx900:
643 ape_args->gpu_id = 22124;
646 fatal(
"Invalid gfx version for dGPU\n");
650 case GfxVersion::gfx902:
651 ape_args->gpu_id = 2765;
654 fatal(
"Invalid gfx version for APU\n");
658 assert(
bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
659 assert(
bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
660 assert(
bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
661 assert(
bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
662 assert(
bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
663 assert(
bits<Addr>(ape_args->lds_base, 63, 47) != 0);
664 assert(
bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
665 assert(
bits<Addr>(ape_args->lds_limit, 63, 47) != 0);
675 warn(
"unimplemented ioctl: AMDKFD_IOC_ACQUIRE_VM\n");
696 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
702 [[maybe_unused]]
Addr mmap_offset = 0;
708 bool cacheable =
true;
711 DPRINTF(GPUDriver,
"amdkfd allocation type: VRAM\n");
712 args->mmap_offset = args->va_addr;
732 pa_addr = process->seWorkload->allocPhysPages(
740 DPRINTF(GPUDriver,
"Mapping VA %p to framebuffer PA %p size "
741 "%d\n", args->va_addr, pa_addr, args->
size);
744 DPRINTF(GPUDriver,
"amdkfd allocation type: USERPTR\n");
745 mmap_offset = args->mmap_offset;
748 pa_addr = process->seWorkload->allocPhysPages(npages);
750 DPRINTF(GPUDriver,
"Mapping VA %p to framebuffer PA %p size "
751 "%d\n", args->va_addr, pa_addr, args->
size);
760 DPRINTF(GPUDriver,
"amdkfd allocation type: GTT\n");
761 args->mmap_offset = args->va_addr;
769 pa_addr = process->seWorkload->allocPhysPages(npages);
771 DPRINTF(GPUDriver,
"Mapping VA %p to framebuffer PA %p size "
772 "%d\n", args->va_addr, pa_addr, args->
size);
784 DPRINTF(GPUDriver,
"amdkfd allocation type: DOORBELL\n");
794 DPRINTF(GPUDriver,
"amdkfd allocation arguments: va_addr %p "
795 "size %lu, mmap_offset %p, gpu_id %d\n",
796 args->va_addr, args->
size, mmap_offset, args->gpu_id);
800 process->pTable->map(args->va_addr, pa_addr, args->
size,
817 args->handle= args->va_addr;
823 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
828 DPRINTF(GPUDriver,
"amdkfd free arguments: handle %p ",
833 process->pTable->unmap(args->handle, size);
852 warn(
"unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
857 warn(
"unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
862 warn(
"unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
867 warn(
"unimplemented ioctl: AMDKFD_IOC_GET_QUEUE_WAVE_STATE\n");
872 warn(
"unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
877 warn(
"unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
882 warn(
"unimplemented ioctl: AMDKFD_IOC_ALLOC_QUEUE_GWS\n");
887 warn(
"unimplemented ioctl: AMDKFD_IOC_SMI_EVENTS\n");
891 fatal(
"%s: bad ioctl %d\n", req);
901 Tick wakeup_delay((uint64_t)milliSecTimeout * 1000000000);
903 TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay);
906 "CPU %d is put to sleep\n", tc->
cpuId());
912 return ((
Addr)gpuNum << 61) + 0x1000000000000L;
918 return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
924 return ((
Addr)gpuNum << 61) + 0x100000000L;
932 return ((
Addr)0x1 << 48);
938 return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
944 return ((
Addr)gpuNum << 61) + 0x0;
952 return ((
Addr)0x2 << 48);
958 return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
966 DPRINTF(GPUDriver,
"Registering [%p - %p] with MTYPE %d\n",
969 "Attempted to double register Mtypes for [%p - %p]\n",
978 assert((
vma->first.start() == start));
980 DPRINTF(GPUDriver,
"Unregistering [%p - %p]\n",
vma->first.start(),
995 DPRINTF(GPUShader,
"Setting req from [%p - %p] MTYPE %d\n"
997 req->setCacheCoherenceFlags(
vma->second);
The AddrRange class encapsulates an address range, and supports a number of tests to check if two ran...
bool copyIn(const PortProxy &memproxy)
copy data into simulator space (read from target memory)
const int size
buffer size
bool copyOut(const PortProxy &memproxy)
copy data out of simulator space (write to target memory)
EmulatedDriver is an abstract base class for fake SE-mode device drivers.
const std::string & filename
filename for opening this driver (under /dev)
HSAPacketProcessor & hsaPacketProc()
void attachDriver(GPUComputeDriver *driver)
void scheduleWakeup(Tick wakeup_delay)
const char * description() const override
Return a C string describing the event.
void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start, Addr length)
Allocate/deallocate GPUVM VMAs for tracking virtual address allocations and properties on DGPUs.
void setMtype(RequestPtr req)
Called by the compute units right before a request is issued to ruby.
virtual void signalWakeupEvent(uint32_t event_id)
int open(ThreadContext *tc, int mode, int flags) override
Create an FD entry for the KFD inside of the owning process.
int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override
Abstract method, invoked when the user program calls ioctl() on the file descriptor returned by a pre...
void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout)
Addr scratchApeLimit(Addr apeBase) const
GPUComputeDriver(const Params &p)
Addr deallocateGpuVma(Addr start)
Addr scratchApeBase(int gpuNum) const
Addr scratchApeBaseV9() const
std::unordered_map< ThreadContext *, EventList > TCEvents
Addr gpuVmApeBase(int gpuNum) const
The aperture (APE) base/limit pairs are set statically at startup by the real KFD.
Addr ldsApeBaseV9() const
void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr)
Forward relevant parameters to packet processor; queueId is used to link doorbell.
Request::CacheCoherenceFlags defaultMtype
GPUComputeDriverParams Params
std::unordered_map< uint32_t, ETEntry > ETable
Addr mmap(ThreadContext *tc, Addr start, uint64_t length, int prot, int tgt_flags, int tgt_fd, off_t offset) override
Currently, mmap() will simply set up a mapping for the associated device's packet processor's doorbell...
GPUCommandProcessor * device
GPU that is controlled by this driver.
AddrRangeMap< Request::CacheCoherenceFlags, 1 > gpuVmas
VMA structures for GPUVM memory.
Addr ldsApeBase(int gpuNum) const
Addr ldsApeLimit(Addr apeBase) const
Addr gpuVmApeLimit(Addr apeBase) const
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
virtual std::string name() const
This object is a proxy for a port or other object which implements the functional response protocol,...
std::shared_ptr< MemState > memState
ThreadContext is the external interface to all thread state for anything outside of the CPU.
virtual BaseMMU * getMMUPtr()=0
virtual Process * getProcessPtr()=0
virtual void suspend()=0
Set the status to Suspended.
virtual int cpuId() const =0
This proxy attempts to translate virtual addresses using the TLBs.
TypedBufferArg is a class template; instances of this template represent typed buffers in target user...
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
The GPUComputeDriver implements an HSADriver for an HSA AMD GPU agent.
AddrRange RangeSize(Addr start, Addr size)
Addr end() const
Get the end address of the range.
Addr start() const
Get the start address of the range.
static constexpr T divCeil(const T &a, const U &b)
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
void set(Type mask)
Set all flag's bits matching the given mask.
void clear()
Clear all flag's bits.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condi...
#define fatal(...)
This implements a cprintf based fatal() function.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
#define KFD_MMAP_TYPE_DOORBELL
#define KFD_MMAP_TYPE_MASK
#define KFD_MMAP_TYPE_EVENTS
#define KFD_MMAP_GPU_ID(gpu_id)
#define AMDKFD_IOC_RESET_EVENT
#define AMDKFD_IOC_GET_CLOCK_COUNTERS
#define AMDKFD_IOC_GET_DMABUF_INFO
#define AMDKFD_IOC_IMPORT_DMABUF
#define KFD_IOCTL_MAJOR_VERSION
#define AMDKFD_IOC_SET_MEMORY_POLICY
#define AMDKFD_IOC_GET_VERSION
#define AMDKFD_IOC_DESTROY_EVENT
#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL
#define AMDKFD_IOC_SET_SCRATCH_BACKING_VA
#define KFD_SIGNAL_EVENT_LIMIT
#define AMDKFD_IOC_DBG_REGISTER
#define AMDKFD_IOC_ACQUIRE_VM
#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT
#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW
#define KFD_IOC_EVENT_SIGNAL
#define AMDKFD_IOC_CREATE_EVENT
#define AMDKFD_IOC_WAIT_EVENTS
#define AMDKFD_IOC_DESTROY_QUEUE
#define AMDKFD_IOC_SMI_EVENTS
#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR
#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU
#define AMDKFD_IOC_GET_TILE_CONFIG
#define AMDKFD_IOC_SET_EVENT
#define AMDKFD_IOC_MAP_MEMORY_TO_GPU
#define AMDKFD_IOC_DBG_UNREGISTER
#define AMDKFD_IOC_SET_CU_MASK
#define AMDKFD_IOC_CREATE_QUEUE
#define AMDKFD_IOC_FREE_MEMORY_OF_GPU
#define KFD_IOCTL_MINOR_VERSION
#define AMDKFD_IOC_GET_PROCESS_APERTURES
#define AMDKFD_IOC_DBG_WAVE_CONTROL
#define AMDKFD_IOC_UPDATE_QUEUE
#define AMDKFD_IOC_DBG_ADDRESS_WATCH
#define KFD_IOC_ALLOC_MEM_FLAGS_GTT
#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE
#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM
#define AMDKFD_IOC_ALLOC_QUEUE_GWS
#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU
#define AMDKFD_IOC_SET_TRAP_HANDLER
Copyright (c) 2024 - Pranith Kumar. Copyright (c) 2020 Inria. All rights reserved.
std::shared_ptr< Request > RequestPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
uint64_t Tick
Tick count type.
PortProxy Object Declaration.
This file defines buffer classes used to handle pointer arguments in emulated syscalls.