41#include "debug/GPUDriver.hh"
42#include "debug/GPUShader.hh"
51#include "params/GPUComputeDriver.hh"
62 isdGPU(
p.isdGPU), gfxVersion(
p.gfxVersion), dGPUPoolID(
p.dGPUPoolID),
63 eventPage(0), eventSlotIndex(0)
66 DPRINTF(GPUDriver,
"Constructing KFD: device\n");
70 std::bitset<MtypeFlags::NUM_MTYPE_BITS> mtype(
p.m_type);
87 return "DriverWakeupEvent";
98 auto device_fd_entry = std::make_shared<DeviceFDEntry>(
this,
filename);
99 int tgt_fd = process->fds->allocFD(device_fd_entry);
109 int prot,
int tgt_flags,
int tgt_fd, off_t
offset)
116 DPRINTF(GPUDriver,
"amdkfd mmap (start: %p, length: 0x%x,"
117 "offset: 0x%x)\n", start, length,
offset);
121 DPRINTF(GPUDriver,
"amdkfd mmap type DOORBELL offset\n");
122 start = mem_state->extendMmap(length);
127 DPRINTF(GPUDriver,
"amdkfd mmap type EVENTS offset\n");
129 "Start address should be provided by KFD\n");
131 "Requested length %d, expected length %d; length "
139 eventPage = mem_state->extendMmap(length);
144 warn_once(
"Unrecognized kfd mmap type %llx\n", mmap_type);
165 fatal(
"%s: Exceeded maximum number of HSA queues allowed\n",
name());
178 args->ring_base_address, args->queue_id,
187 driver->schedule(
this,
curTick() + wakeup_delay);
194 "Trying wakeup on an event that is not yet created\n");
195 if (
ETable[event_id].threadWaiting) {
197 "No thread context to wake up\n");
200 "Signal event: Waking up CPU %d\n", tc->cpuId());
213 ETable[event_id].setEvent =
true;
221 "Timer event: Waking up CPU %d\n", tc->cpuId());
223 driver->TCEvents[tc].clearEvents();
240 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_GET_VERSION\n");
251 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_CREATE_QUEUE\n");
262 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_DESTROY_QUEUE;" \
263 "queue offset %d\n", args->queue_id);
283 warn(
"unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
288 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");
294 args->system_clock_freq = 1000000000;
301 args->gpu_clock_counter = elapsed_nsec;
302 args->cpu_clock_counter = elapsed_nsec;
303 args->system_clock_counter = elapsed_nsec;
310 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");
313 args->num_of_nodes = 1;
321 for (
int i = 0;
i < args->num_of_nodes; ++
i) {
330 case GfxVersion::gfx801:
331 case GfxVersion::gfx803:
332 args->process_apertures[
i].scratch_base =
334 args->process_apertures[
i].lds_base =
337 case GfxVersion::gfx900:
338 case GfxVersion::gfx902:
339 args->process_apertures[
i].scratch_base =
341 args->process_apertures[
i].lds_base =
345 fatal(
"Invalid gfx version\n");
349 args->process_apertures[
i].scratch_limit =
352 args->process_apertures[
i].lds_limit =
356 case GfxVersion::gfx801:
357 args->process_apertures[
i].gpuvm_base =
359 args->process_apertures[
i].gpuvm_limit =
362 case GfxVersion::gfx803:
363 case GfxVersion::gfx900:
364 case GfxVersion::gfx902:
366 args->process_apertures[
i].gpuvm_base = 0x1000000ull;
368 args->process_apertures[
i].gpuvm_limit =
369 0x0000800000000000ULL - 1;
372 fatal(
"Invalid gfx version");
386 case GfxVersion::gfx803:
387 args->process_apertures[
i].gpu_id = 50156;
389 case GfxVersion::gfx900:
390 args->process_apertures[
i].gpu_id = 22124;
393 fatal(
"Invalid gfx version for dGPU\n");
397 case GfxVersion::gfx801:
398 case GfxVersion::gfx902:
399 args->process_apertures[
i].gpu_id = 2765;
402 fatal(
"Invalid gfx version for APU\n");
406 DPRINTF(GPUDriver,
"GPUVM base for node[%i] = %#x\n",
i,
407 args->process_apertures[
i].gpuvm_base);
408 DPRINTF(GPUDriver,
"GPUVM limit for node[%i] = %#x\n",
i,
409 args->process_apertures[
i].gpuvm_limit);
411 DPRINTF(GPUDriver,
"LDS base for node[%i] = %#x\n",
i,
412 args->process_apertures[
i].lds_base);
413 DPRINTF(GPUDriver,
"LDS limit for node[%i] = %#x\n",
i,
414 args->process_apertures[
i].lds_limit);
416 DPRINTF(GPUDriver,
"Scratch base for node[%i] = %#x\n",
i,
417 args->process_apertures[
i].scratch_base);
418 DPRINTF(GPUDriver,
"Scratch limit for node[%i] = %#x\n",
i,
419 args->process_apertures[
i].scratch_limit);
427 assert(bits<Addr>(args->process_apertures[
i].scratch_base, 63,
429 assert(bits<Addr>(args->process_apertures[
i].scratch_base, 63,
431 assert(bits<Addr>(args->process_apertures[
i].scratch_limit, 63,
433 assert(bits<Addr>(args->process_apertures[
i].scratch_limit, 63,
435 assert(bits<Addr>(args->process_apertures[
i].lds_base, 63,
437 assert(bits<Addr>(args->process_apertures[
i].lds_base, 63,
439 assert(bits<Addr>(args->process_apertures[
i].lds_limit, 63,
441 assert(bits<Addr>(args->process_apertures[
i].lds_limit, 63,
450 warn(
"unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
455 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_CREATE_EVENT\n");
460 warn(
"Signal events are only supported currently\n");
462 fatal(
"Signal event wasn't created; signal limit reached\n");
466 uint64_t page_index = 0;
473 args->event_trigger_data = args->event_id;
474 DPRINTF(GPUDriver,
"amdkfd create events"
475 "(event_id: 0x%x, offset: 0x%x)\n",
476 args->event_id, args->event_page_offset);
489 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
492 DPRINTF(GPUDriver,
"amdkfd destroying event %d\n", args->event_id);
494 "Event ID invalid, cannot destroy this event\n");
495 ETable.erase(args->event_id);
500 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_SET_EVENTS\n");
503 DPRINTF(GPUDriver,
"amdkfd set event %d\n", args->event_id);
505 "Event ID invlaid, cannot set this event\n");
506 ETable[args->event_id].setEvent =
true;
512 warn(
"unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
517 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
522 DPRINTF(GPUDriver,
"amdkfd wait for events"
523 "(wait on all: %d, timeout : %d, num_events: %s)\n",
524 args->wait_for_all, args->timeout, args->num_events);
525 panic_if(args->wait_for_all != 0 && args->num_events > 1,
526 "Wait for all events not supported\n");
527 bool should_sleep =
true;
531 TCEvents.emplace(std::piecewise_construct, std::make_tuple(tc),
532 std::make_tuple(
this, tc));
533 DPRINTF(GPUDriver,
"\tamdkfd creating event list"
534 " for thread %d\n", tc->
cpuId());
537 "There are %d events that put this thread to sleep,"
538 " this thread should not be running\n",
540 for (
int i = 0;
i < args->num_events;
i++) {
542 "Event pointer invalid\n");
546 EventData.
copyIn(virt_proxy);
548 "\tamdkfd wait for event %d\n", EventData->event_id);
550 "Event ID invalid, cannot set this event\n");
551 if (
ETable[EventData->event_id].threadWaiting)
552 warn(
"Multiple threads waiting on the same event\n");
553 if (
ETable[EventData->event_id].setEvent) {
556 ETable[EventData->event_id].setEvent =
false;
557 should_sleep =
false;
561 ETable[EventData->event_id].threadWaiting =
true;
562 ETable[EventData->event_id].tc = tc;
563 TCEvents[tc].signalEvents.insert(EventData->event_id);
571 args->wait_result = 0;
584 warn(
"unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
589 warn(
"unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
594 warn(
"unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
599 warn(
"unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
604 warn(
"unimplemented ioctl: AMDKFD_IOC_SET_SCRATCH_BACKING_VA\n");
609 warn(
"unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
614 warn(
"unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
620 "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");
625 ioc_args.
copyIn(virt_proxy);
626 ioc_args->num_of_nodes = 1;
628 for (
int i = 0;
i < ioc_args->num_of_nodes; ++
i) {
630 (ioc_args->kfd_process_device_apertures_ptr);
633 case GfxVersion::gfx801:
634 case GfxVersion::gfx803:
638 case GfxVersion::gfx900:
639 case GfxVersion::gfx902:
644 fatal(
"Invalid gfx version\n");
648 ape_args->scratch_limit =
650 ape_args->lds_limit =
ldsApeLimit(ape_args->lds_base);
653 case GfxVersion::gfx801:
655 ape_args->gpuvm_limit =
658 case GfxVersion::gfx803:
659 case GfxVersion::gfx900:
660 case GfxVersion::gfx902:
662 ape_args->gpuvm_base = 0x1000000ull;
664 ape_args->gpuvm_limit = 0x0000800000000000ULL - 1;
667 fatal(
"Invalid gfx version\n");
673 case GfxVersion::gfx803:
674 ape_args->gpu_id = 50156;
676 case GfxVersion::gfx900:
677 ape_args->gpu_id = 22124;
680 fatal(
"Invalid gfx version for dGPU\n");
684 case GfxVersion::gfx801:
685 case GfxVersion::gfx902:
686 ape_args->gpu_id = 2765;
689 fatal(
"Invalid gfx version for APU\n");
693 assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
694 assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
695 assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
696 assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
697 assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
698 assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
699 assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
700 assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);
710 warn(
"unimplemented ioctl: AMDKFD_IOC_ACQUIRE_VM\n");
731 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
737 [[maybe_unused]]
Addr mmap_offset = 0;
743 bool cacheable =
true;
746 DPRINTF(GPUDriver,
"amdkfd allocation type: VRAM\n");
747 args->mmap_offset = args->va_addr;
767 pa_addr = process->seWorkload->allocPhysPages(
775 DPRINTF(GPUDriver,
"Mapping VA %p to framebuffer PA %p size "
776 "%d\n", args->va_addr, pa_addr, args->
size);
779 DPRINTF(GPUDriver,
"amdkfd allocation type: USERPTR\n");
780 mmap_offset = args->mmap_offset;
783 pa_addr = process->seWorkload->allocPhysPages(npages);
785 DPRINTF(GPUDriver,
"Mapping VA %p to framebuffer PA %p size "
786 "%d\n", args->va_addr, pa_addr, args->
size);
795 DPRINTF(GPUDriver,
"amdkfd allocation type: GTT\n");
796 args->mmap_offset = args->va_addr;
804 pa_addr = process->seWorkload->allocPhysPages(npages);
806 DPRINTF(GPUDriver,
"Mapping VA %p to framebuffer PA %p size "
807 "%d\n", args->va_addr, pa_addr, args->
size);
819 DPRINTF(GPUDriver,
"amdkfd allocation type: DOORBELL\n");
829 DPRINTF(GPUDriver,
"amdkfd allocation arguments: va_addr %p "
830 "size %lu, mmap_offset %p, gpu_id %d\n",
831 args->va_addr, args->
size, mmap_offset, args->gpu_id);
835 process->pTable->map(args->va_addr, pa_addr, args->
size,
852 args->handle= args->va_addr;
858 DPRINTF(GPUDriver,
"ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
863 DPRINTF(GPUDriver,
"amdkfd free arguments: handle %p ",
868 process->pTable->unmap(args->handle, size);
887 warn(
"unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
892 warn(
"unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
897 warn(
"unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
902 warn(
"unimplemented ioctl: AMDKFD_IOC_GET_QUEUE_WAVE_STATE\n");
907 warn(
"unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
912 warn(
"unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
917 warn(
"unimplemented ioctl: AMDKFD_IOC_ALLOC_QUEUE_GWS\n");
922 warn(
"unimplemented ioctl: AMDKFD_IOC_SMI_EVENTS\n");
926 fatal(
"%s: bad ioctl %d\n", req);
936 Tick wakeup_delay((uint64_t)milliSecTimeout * 1000000000);
938 TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay);
941 "CPU %d is put to sleep\n", tc->
cpuId());
947 return ((
Addr)gpuNum << 61) + 0x1000000000000L;
953 return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
959 return ((
Addr)gpuNum << 61) + 0x100000000L;
967 return ((
Addr)0x1 << 48);
973 return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
979 return ((
Addr)gpuNum << 61) + 0x0;
987 return ((
Addr)0x2 << 48);
993 return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
1001 DPRINTF(GPUDriver,
"Registering [%p - %p] with MTYPE %d\n",
1002 range.
start(), range.
end(), mtype);
1004 "Attempted to double register Mtypes for [%p - %p]\n",
1011 auto vma =
gpuVmas.contains(start);
1013 assert((vma->first.start() == start));
1014 Addr size = vma->first.size();
1015 DPRINTF(GPUDriver,
"Unregistering [%p - %p]\n", vma->first.start(),
1028 auto vma =
gpuVmas.contains(range);
1030 DPRINTF(GPUShader,
"Setting req from [%p - %p] MTYPE %d\n"
1031 "%d\n", range.
start(), range.
end(), vma->second);
1032 req->setCacheCoherenceFlags(vma->second);
The AddrRange class encapsulates an address range, and supports a number of tests to check if two ranges intersect, if a range contains a specific address etc.
bool copyIn(const PortProxy &memproxy)
copy data into simulator space (read from target memory)
const int size
buffer size
bool copyOut(const PortProxy &memproxy)
copy data out of simulator space (write to target memory)
EmulatedDriver is an abstract base class for fake SE-mode device drivers.
const std::string & filename
filename for opening this driver (under /dev)
HSAPacketProcessor & hsaPacketProc()
void attachDriver(GPUComputeDriver *driver)
void scheduleWakeup(Tick wakeup_delay)
const char * description() const override
Return a C string describing the event.
void allocateGpuVma(Request::CacheCoherenceFlags mtype, Addr start, Addr length)
Allocate/deallocate GPUVM VMAs for tracking virtual address allocations and properties on DGPUs.
void setMtype(RequestPtr req)
Called by the compute units right before a request is issued to ruby.
virtual void signalWakeupEvent(uint32_t event_id)
int open(ThreadContext *tc, int mode, int flags) override
Create an FD entry for the KFD inside of the owning process.
int ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf) override
Abstract method, invoked when the user program calls ioctl() on the file descriptor returned by a previous open().
void sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout)
Addr scratchApeLimit(Addr apeBase) const
GPUComputeDriver(const Params &p)
Addr deallocateGpuVma(Addr start)
Addr scratchApeBase(int gpuNum) const
Addr scratchApeBaseV9() const
std::unordered_map< ThreadContext *, EventList > TCEvents
Addr gpuVmApeBase(int gpuNum) const
The aperture (APE) base/limit pairs are set statically at startup by the real KFD.
Addr ldsApeBaseV9() const
void allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr)
Forward relevant parameters to packet processor; queueId is used to link doorbell.
Request::CacheCoherenceFlags defaultMtype
GPUComputeDriverParams Params
std::unordered_map< uint32_t, ETEntry > ETable
Addr mmap(ThreadContext *tc, Addr start, uint64_t length, int prot, int tgt_flags, int tgt_fd, off_t offset) override
Currently, mmap() will simply set up a mapping for the associated device's packet processor's doorbells and create the event page.
GPUCommandProcessor * device
GPU that is controlled by this driver.
AddrRangeMap< Request::CacheCoherenceFlags, 1 > gpuVmas
VMA structures for GPUVM memory.
Addr ldsApeBase(int gpuNum) const
Addr ldsApeLimit(Addr apeBase) const
Addr gpuVmApeLimit(Addr apeBase) const
void setDeviceQueueDesc(uint64_t hostReadIndexPointer, uint64_t basePointer, uint64_t queue_id, uint32_t size, int doorbellSize, GfxVersion gfxVersion, Addr offset=0, uint64_t rd_idx=0)
void unsetDeviceQueueDesc(uint64_t queue_id, int doorbellSize)
virtual std::string name() const
This object is a proxy for a port or other object which implements the functional response protocol, to be used for debug accesses.
std::shared_ptr< MemState > memState
ThreadContext is the external interface to all thread state for anything outside of the CPU.
virtual BaseMMU * getMMUPtr()=0
virtual Process * getProcessPtr()=0
virtual void suspend()=0
Set the status to Suspended.
virtual int cpuId() const =0
This proxy attempts to translate virtual addresses using the TLBs.
TypedBufferArg is a class template; instances of this template represent typed buffers in target user space.
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets, from the HSA packet processor (HSAPP).
The GPUComputeDriver implements an HSADriver for an HSA AMD GPU agent.
AddrRange RangeSize(Addr start, Addr size)
Addr end() const
Get the end address of the range.
Addr start() const
Get the start address of the range.
static constexpr T divCeil(const T &a, const U &b)
void set(Type mask)
Set all flag's bits matching the given mask.
void clear()
Clear all flag's bits.
#define fatal_if(cond,...)
Conditional fatal macro that checks the supplied condition and only causes a fatal error if the condition is true and allows the programmer to specify diagnostic printout.
#define fatal(...)
This implements a cprintf based fatal() function.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true and allows the programmer to specify diagnostic printout.
#define KFD_MMAP_TYPE_DOORBELL
#define KFD_MMAP_TYPE_MASK
#define KFD_MMAP_TYPE_EVENTS
#define KFD_MMAP_GPU_ID(gpu_id)
#define AMDKFD_IOC_RESET_EVENT
#define AMDKFD_IOC_GET_CLOCK_COUNTERS
#define AMDKFD_IOC_GET_DMABUF_INFO
#define AMDKFD_IOC_IMPORT_DMABUF
#define KFD_IOCTL_MAJOR_VERSION
#define AMDKFD_IOC_SET_MEMORY_POLICY
#define AMDKFD_IOC_GET_VERSION
#define AMDKFD_IOC_DESTROY_EVENT
#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL
#define AMDKFD_IOC_SET_SCRATCH_BACKING_VA
#define KFD_SIGNAL_EVENT_LIMIT
#define AMDKFD_IOC_DBG_REGISTER
#define AMDKFD_IOC_ACQUIRE_VM
#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT
#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW
#define KFD_IOC_EVENT_SIGNAL
#define AMDKFD_IOC_CREATE_EVENT
#define AMDKFD_IOC_WAIT_EVENTS
#define AMDKFD_IOC_DESTROY_QUEUE
#define AMDKFD_IOC_SMI_EVENTS
#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR
#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU
#define AMDKFD_IOC_GET_TILE_CONFIG
#define AMDKFD_IOC_SET_EVENT
#define AMDKFD_IOC_MAP_MEMORY_TO_GPU
#define AMDKFD_IOC_DBG_UNREGISTER
#define AMDKFD_IOC_SET_CU_MASK
#define AMDKFD_IOC_CREATE_QUEUE
#define AMDKFD_IOC_FREE_MEMORY_OF_GPU
#define KFD_IOCTL_MINOR_VERSION
#define AMDKFD_IOC_GET_PROCESS_APERTURES
#define AMDKFD_IOC_DBG_WAVE_CONTROL
#define AMDKFD_IOC_UPDATE_QUEUE
#define AMDKFD_IOC_DBG_ADDRESS_WATCH
#define KFD_IOC_ALLOC_MEM_FLAGS_GTT
#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE
#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM
#define AMDKFD_IOC_ALLOC_QUEUE_GWS
#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU
#define AMDKFD_IOC_SET_TRAP_HANDLER
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
Tick curTick()
The universal simulation clock.
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
bool FullSystem
The FullSystem variable can be used to determine the current mode of simulation.
uint64_t Tick
Tick count type.
PortProxy Object Declaration.
This file defines buffer classes used to handle pointer arguments in emulated syscalls.