gem5 v24.0.0.0
gpu_compute_driver.cc
/*
 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_compute_driver.hh"

#include <memory>

#include "arch/x86/page_size.hh"
#include "base/compiler.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "cpu/thread_context.hh"
#include "debug/GPUDriver.hh"
#include "debug/GPUShader.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "dev/hsa/kfd_event_defines.h"
#include "dev/hsa/kfd_ioctl.h"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/shader.hh"
#include "mem/port_proxy.hh"
#include "mem/se_translating_port_proxy.hh"
#include "mem/translating_port_proxy.hh"
#include "params/GPUComputeDriver.hh"
#include "sim/full_system.hh"
#include "sim/process.hh"
#include "sim/se_workload.hh"
#include "sim/syscall_emul_buf.hh"

namespace gem5
{

GPUComputeDriver::GPUComputeDriver(const Params &p)
    : EmulatedDriver(p), device(p.device), queueId(0),
      isdGPU(p.isdGPU), gfxVersion(p.gfxVersion), dGPUPoolID(p.dGPUPoolID),
      eventPage(0), eventSlotIndex(0)
{
    device->attachDriver(this);
    DPRINTF(GPUDriver, "Constructing KFD: device\n");

    // Convert the 3 bit mtype specified in Shader.py to the proper type
    // used for requests.
    std::bitset<MtypeFlags::NUM_MTYPE_BITS> mtype(p.m_type);
    if (mtype.test(MtypeFlags::SHARED)) {
        defaultMtype.set(Request::SHARED);
    }

    if (mtype.test(MtypeFlags::READ_WRITE)) {
        defaultMtype.set(Request::READ_WRITE);
    }

    if (mtype.test(MtypeFlags::CACHED)) {
        defaultMtype.set(Request::CACHED);
    }
}

const char*
GPUComputeDriver::DriverWakeupEvent::description() const
{
    return "DriverWakeupEvent";
}

/**
 * Create an FD entry for the KFD inside of the owning process.
 */
int
GPUComputeDriver::open(ThreadContext *tc, int mode, int flags)
{
    DPRINTF(GPUDriver, "Opened %s\n", filename);
    auto process = tc->getProcessPtr();
    auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
    int tgt_fd = process->fds->allocFD(device_fd_entry);
    return tgt_fd;
}

/**
 * Currently, mmap() will simply setup a mapping for the associated
 * device's packet processor's doorbells and creates the event page.
 */
Addr
GPUComputeDriver::mmap(ThreadContext *tc, Addr start, uint64_t length,
                       int prot, int tgt_flags, int tgt_fd, off_t offset)
{
    auto process = tc->getProcessPtr();
    auto mem_state = process->memState;

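    // Like the real KFD, gem5 encodes what is being mapped (doorbells,
    // the event page, etc.) in the upper bits of the mmap() page offset,
    // so masking the page offset recovers the mapping type here.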
    Addr pg_off = offset >> PAGE_SHIFT;
    Addr mmap_type = pg_off & KFD_MMAP_TYPE_MASK;
    DPRINTF(GPUDriver, "amdkfd mmap (start: %p, length: 0x%x, "
            "offset: 0x%x)\n", start, length, offset);

    switch (mmap_type) {
      case KFD_MMAP_TYPE_DOORBELL:
        DPRINTF(GPUDriver, "amdkfd mmap type DOORBELL offset\n");
        start = mem_state->extendMmap(length);
        process->pTable->map(start, device->hsaPacketProc().pioAddr,
                length, false);
        break;
      case KFD_MMAP_TYPE_EVENTS:
        DPRINTF(GPUDriver, "amdkfd mmap type EVENTS offset\n");
        panic_if(start != 0,
                 "Start address should be provided by KFD\n");
        panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
                 "Requested length %d, expected length %d; length "
                 "mismatch\n", length, 8 * KFD_SIGNAL_EVENT_LIMIT);
        // We use a single event page for the life of the process; it is
        // allocated on the first request and reused afterwards.
        if (!eventPage) {
            eventPage = mem_state->extendMmap(length);
            start = eventPage;
        }
        break;
      default:
        warn_once("Unrecognized kfd mmap type %llx\n", mmap_type);
        break;
    }

    return start;
}

/**
 * Forward relevant parameters to packet processor; queueId is used to
 * link the doorbell. Queue IDs are not currently reused, and we allocate
 * only one page (4096 bytes) of doorbells, so check that this queue ID
 * still fits in that page.
 */
void
GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr)
{
    TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf_addr);
    args.copyIn(mem_proxy);

    if ((doorbellSize() * queueId) > 4096) {
        fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
    }

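    // The offset handed back to user space encodes the mmap type and the
    // owning GPU in its upper bits; the runtime later passes it back
    // through mmap() to map this queue's doorbell.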
    args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL |
        KFD_MMAP_GPU_ID(args->gpu_id)) << PAGE_SHIFT;

    // For Vega, the offset needs to include the exact doorbell location
    if (doorbellSize())
        args->doorbell_offset += queueId * doorbellSize();

    args->queue_id = queueId++;
    auto &hsa_pp = device->hsaPacketProc();
    hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
                              args->ring_base_address, args->queue_id,
                              args->ring_size, doorbellSize(), gfxVersion);
    args.copyOut(mem_proxy);
}

void
GPUComputeDriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
{
    assert(driver);
    driver->schedule(this, curTick() + wakeup_delay);
}

void
GPUComputeDriver::signalWakeupEvent(uint32_t event_id)
{
    panic_if(event_id >= eventSlotIndex,
             "Trying to wake up on an event that has not been created yet\n");
    if (ETable[event_id].threadWaiting) {
        panic_if(!ETable[event_id].tc,
                 "No thread context to wake up\n");
        ThreadContext *tc = ETable[event_id].tc;
        DPRINTF(GPUDriver,
                "Signal event: Waking up CPU %d\n", tc->cpuId());
        // Remove events that can wake up this thread
        TCEvents[tc].clearEvents();
        // Now wake up this thread
        tc->activate();
    } else {
        // This may be a race between an ioctl asking to wait on this
        // event and this signalWakeupEvent. We handle the race by setting
        // the event here; the wait ioctl takes the appropriate action
        // when it finds an already-set event. This may also be a genuine
        // case in which the runtime decided not to wait on this event,
        // but since we cannot distinguish that from the race, we set the
        // event either way.
        ETable[event_id].setEvent = true;
    }
}

void
GPUComputeDriver::DriverWakeupEvent::process()
{
    DPRINTF(GPUDriver,
            "Timer event: Waking up CPU %d\n", tc->cpuId());
    // Remove events that can wake up this thread
    driver->TCEvents[tc].clearEvents();
    // Now wake up this thread
    tc->activate();
}

int
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
{
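    // Construct both styles of proxy up front, then bind, by reference,
    // the one that matches the current simulation mode (full-system vs.
    // syscall emulation).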
    TranslatingPortProxy fs_proxy(tc);
    SETranslatingPortProxy se_proxy(tc);
    PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
    auto process = tc->getProcessPtr();
    auto mem_state = process->memState;

    switch (req) {
      case AMDKFD_IOC_GET_VERSION:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");

            TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
            args->major_version = KFD_IOCTL_MAJOR_VERSION;
            args->minor_version = KFD_IOCTL_MINOR_VERSION;

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_CREATE_QUEUE:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");

            allocateQueue(virt_proxy, ioc_buf);

            DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
        }
        break;
      case AMDKFD_IOC_DESTROY_QUEUE:
        {
            TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE; "
                    "queue offset %d\n", args->queue_id);
            device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id,
                                                         doorbellSize());
        }
        break;
      case AMDKFD_IOC_SET_MEMORY_POLICY:
        {
            // The runtime uses this ioctl to program the coherence policy
            // of the default and alternate apertures; gem5 does not model
            // those policies, so the request is ignored.
            warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
        }
        break;
      case AMDKFD_IOC_GET_CLOCK_COUNTERS:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");

            TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            // Set nanosecond resolution
            args->system_clock_freq = 1000000000;

            /**
             * Derive all clock counters from the simulator tick. All
             * clocks in the simulated system advance in lockstep, so the
             * GPU, CPU, and system counters report the same value.
             */
            uint64_t elapsed_nsec = curTick() / sim_clock::as_int::ns;
            args->gpu_clock_counter = elapsed_nsec;
            args->cpu_clock_counter = elapsed_nsec;
            args->system_clock_counter = elapsed_nsec;

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_GET_PROCESS_APERTURES:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_args>
                args(ioc_buf);
            args->num_of_nodes = 1;

            /**
             * Set the aperture base/limit pairs for this node the same
             * way the real KFD would for this gfx version.
             */
            for (int i = 0; i < args->num_of_nodes; ++i) {
                switch (gfxVersion) {
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    args->process_apertures[i].scratch_base =
                        scratchApeBaseV9();
                    args->process_apertures[i].lds_base =
                        ldsApeBaseV9();
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                args->process_apertures[i].scratch_limit =
                    scratchApeLimit(args->process_apertures[i].scratch_base);

                args->process_apertures[i].lds_limit =
                    ldsApeLimit(args->process_apertures[i].lds_base);

                switch (gfxVersion) {
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    // Taken from SVM_USE_BASE in the Linux kernel
                    args->process_apertures[i].gpuvm_base = 0x1000000ull;
                    // Taken from AMDGPU_GMC_HOLE_START in the Linux kernel
                    args->process_apertures[i].gpuvm_limit =
                        0x0000800000000000ULL - 1;
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // NOTE: Must match ID populated by hsaTopology.py
                //
                // https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/
                // blob/6a986c0943e9acd8c4c0cf2a9d510ff42167b43f/include/uapi/
                // linux/kfd_ioctl.h#L564
                //
                // The gpu_id is a device identifier used by the driver for
                // ioctls that allocate arguments. Each device has a unique
                // id composed out of a non-zero base and an offset.
                if (isdGPU) {
                    switch (gfxVersion) {
                      case GfxVersion::gfx900:
                        args->process_apertures[i].gpu_id = 22124;
                        break;
                      default:
                        fatal("Invalid gfx version for dGPU\n");
                    }
                } else {
                    switch (gfxVersion) {
                      case GfxVersion::gfx902:
                        args->process_apertures[i].gpu_id = 2765;
                        break;
                      default:
                        fatal("Invalid gfx version for APU\n");
                    }
                }

                DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_base);
                DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_limit);

                DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_base);
                DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_limit);

                DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_base);
                DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_limit);

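                // The apertures must fall in the non-canonical hole of
                // the x86-64 address space: bits 63:47 may be neither all
                // 0s nor all 1s, so these ranges can never collide with
                // legitimate CPU virtual addresses.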
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0);
            }

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_UPDATE_QUEUE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
        }
        break;
      case AMDKFD_IOC_CREATE_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_EVENT\n");

            TypedBufferArg<kfd_ioctl_create_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            if (args->event_type != KFD_IOC_EVENT_SIGNAL) {
                warn("Signal events are the only events currently "
                     "supported\n");
            } else if (eventSlotIndex == SLOTS_PER_PAGE) {
                fatal("Signal event wasn't created; signal limit reached\n");
            }
            // Currently, we allocate only one signal_page for events.
            // Note that this signal page is of size
            // 8 * KFD_SIGNAL_EVENT_LIMIT.
            uint64_t page_index = 0;
            args->event_page_offset = (page_index | KFD_MMAP_TYPE_EVENTS);
            args->event_page_offset <<= PAGE_SHIFT;
            // TODO: Currently we support only signal events, hence using
            // the same ID for both signal slot and event slot
            args->event_slot_index = eventSlotIndex;
            args->event_id = eventSlotIndex++;
            args->event_trigger_data = args->event_id;
            DPRINTF(GPUDriver, "amdkfd create events "
                    "(event_id: 0x%x, offset: 0x%x)\n",
                    args->event_id, args->event_page_offset);
            // Since eventSlotIndex is increased every time a new event is
            // created, the ETable entry at eventSlotIndex (event_id) is
            // guaranteed to be empty. A future implementation that reuses
            // deleted event_ids should check that the event table entry
            // is empty before inserting a new one.
            ETable.emplace(std::pair<uint32_t, ETEntry>(args->event_id, {}));
            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_DESTROY_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
            TypedBufferArg<kfd_ioctl_destroy_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "amdkfd destroying event %d\n",
                    args->event_id);
            fatal_if(ETable.count(args->event_id) == 0,
                     "Event ID invalid, cannot destroy this event\n");
            ETable.erase(args->event_id);
        }
        break;
      case AMDKFD_IOC_SET_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_SET_EVENT\n");
            TypedBufferArg<kfd_ioctl_set_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "amdkfd set event %d\n", args->event_id);
            fatal_if(ETable.count(args->event_id) == 0,
                     "Event ID invalid, cannot set this event\n");
            ETable[args->event_id].setEvent = true;
            signalWakeupEvent(args->event_id);
        }
        break;
      case AMDKFD_IOC_RESET_EVENT:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
        }
        break;
      case AMDKFD_IOC_WAIT_EVENTS:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
            TypedBufferArg<kfd_ioctl_wait_events_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            kfd_event_data *events =
                (kfd_event_data *)args->events_ptr;
            DPRINTF(GPUDriver, "amdkfd wait for events "
                    "(wait on all: %d, timeout: %d, num_events: %d)\n",
                    args->wait_for_all, args->timeout, args->num_events);
            panic_if(args->wait_for_all != 0 && args->num_events > 1,
                     "Wait for all events not supported\n");
            bool should_sleep = true;
            if (TCEvents.count(tc) == 0) {
                // This thread context is waiting on an event for the
                // first time; initialize its event list.
                TCEvents.emplace(std::piecewise_construct,
                                 std::make_tuple(tc),
                                 std::make_tuple(this, tc));
                DPRINTF(GPUDriver, "\tamdkfd creating event list"
                        " for thread %d\n", tc->cpuId());
            }
            panic_if(TCEvents[tc].signalEvents.size() != 0,
                     "There are %d events that put this thread to sleep,"
                     " this thread should not be running\n",
                     TCEvents[tc].signalEvents.size());
            for (int i = 0; i < args->num_events; i++) {
                panic_if(!events,
                         "Event pointer invalid\n");
                Addr eventDataAddr = (Addr)(events + i);
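                // events_ptr is a user-space array of kfd_event_data;
                // copy each element in through the memory proxy before
                // inspecting it.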
                TypedBufferArg<kfd_event_data> EventData(
                    eventDataAddr, sizeof(kfd_event_data));
                EventData.copyIn(virt_proxy);
                DPRINTF(GPUDriver,
                        "\tamdkfd wait for event %d\n", EventData->event_id);
                panic_if(ETable.count(EventData->event_id) == 0,
                         "Event ID invalid, cannot wait on this event\n");
                if (ETable[EventData->event_id].threadWaiting)
                    warn("Multiple threads waiting on the same event\n");
                if (ETable[EventData->event_id].setEvent) {
                    // The event has already been signalled; unset it and
                    // don't put this thread to sleep.
                    ETable[EventData->event_id].setEvent = false;
                    should_sleep = false;
                }
                if (should_sleep) {
                    // Put this thread to sleep
                    ETable[EventData->event_id].threadWaiting = true;
                    ETable[EventData->event_id].tc = tc;
                    TCEvents[tc].signalEvents.insert(EventData->event_id);
                }
            }

            // TODO: Return the correct wait_result back. Currently
            // returning success for both KFD_WAIT_TIMEOUT and
            // KFD_WAIT_COMPLETE. Ideally, this needs to be done after the
            // event is triggered and after the thread is woken up.
            args->wait_result = 0;
            args.copyOut(virt_proxy);
            if (should_sleep) {
                // Put this thread to sleep
                sleepCPU(tc, args->timeout);
            } else {
                // Remove events that tried to put this thread to sleep
                TCEvents[tc].clearEvents();
            }
        }
        break;
      case AMDKFD_IOC_DBG_REGISTER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
        }
        break;
      case AMDKFD_IOC_DBG_UNREGISTER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
        }
        break;
      case AMDKFD_IOC_DBG_ADDRESS_WATCH:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
        }
        break;
      case AMDKFD_IOC_DBG_WAVE_CONTROL:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
        }
        break;
      case AMDKFD_IOC_SET_SCRATCH_BACKING_VA:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_SCRATCH_BACKING_VA\n");
        }
        break;
      case AMDKFD_IOC_GET_TILE_CONFIG:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
        }
        break;
      case AMDKFD_IOC_SET_TRAP_HANDLER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
        }
        break;
      case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
        {
            DPRINTF(GPUDriver,
                    "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_new_args>
                ioc_args(ioc_buf);

            ioc_args.copyIn(virt_proxy);
            ioc_args->num_of_nodes = 1;

            for (int i = 0; i < ioc_args->num_of_nodes; ++i) {
                TypedBufferArg<kfd_process_device_apertures> ape_args
                    (ioc_args->kfd_process_device_apertures_ptr);

                switch (gfxVersion) {
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    ape_args->scratch_base = scratchApeBaseV9();
                    ape_args->lds_base = ldsApeBaseV9();
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                ape_args->scratch_limit =
                    scratchApeLimit(ape_args->scratch_base);
                ape_args->lds_limit = ldsApeLimit(ape_args->lds_base);

                switch (gfxVersion) {
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    // Taken from SVM_USE_BASE in the Linux kernel
                    ape_args->gpuvm_base = 0x1000000ull;
                    // Taken from AMDGPU_GMC_HOLE_START in the Linux kernel
                    ape_args->gpuvm_limit = 0x0000800000000000ULL - 1;
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // NOTE: Must match ID populated by hsaTopology.py
                if (isdGPU) {
                    switch (gfxVersion) {
                      case GfxVersion::gfx900:
                        ape_args->gpu_id = 22124;
                        break;
                      default:
                        fatal("Invalid gfx version for dGPU\n");
                    }
                } else {
                    switch (gfxVersion) {
                      case GfxVersion::gfx902:
                        ape_args->gpu_id = 2765;
                        break;
                      default:
                        fatal("Invalid gfx version for APU\n");
                    }
                }

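                // As above, the apertures must sit in the x86-64
                // non-canonical hole (bits 63:47 neither all 0s nor all
                // 1s).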
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);

                ape_args.copyOut(virt_proxy);
            }

            ioc_args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_ACQUIRE_VM:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_ACQUIRE_VM\n");
        }
        break;
      /**
       * In real hardware, this ioctl maps host memory, dGPU memory, or
       * dGPU doorbells into GPUVM space. ROCm implements SVM by carving
       * out a region of VA space that the host and the GPU agree on. gem5
       * does not model a separate GPUVM address space (the GPU walks the
       * host page tables directly), so all we do here is create the
       * backing allocation and record its properties.
       */
      case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
            TypedBufferArg<kfd_ioctl_alloc_memory_of_gpu_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            assert(isdGPU || gfxVersion == GfxVersion::gfx902);
            assert((args->va_addr % X86ISA::PageBytes) == 0);
            [[maybe_unused]] Addr mmap_offset = 0;

            Request::CacheCoherenceFlags mtype = defaultMtype;
            Addr pa_addr = 0;

            int npages = divCeil(args->size, (int64_t)X86ISA::PageBytes);
            bool cacheable = true;

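            // Dispatch on the allocation flavor requested by the thunk
            // (VRAM, USERPTR, GTT, or DOORBELL); each branch selects the
            // physical backing and CPU cacheability for the region.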
            if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n");
                args->mmap_offset = args->va_addr;
                // VRAM allocations are device memory mapped into GPUVM
                // space.
                //
                // We can't rely on the lazy host allocator (fixupFault) to
                // handle this mapping since it needs to be placed in dGPU
                // framebuffer memory. The lazy allocator will try to place
                // this in host memory.
                //
                // TODO: We don't have the appropriate bifurcation of the
                // physical address space with different memory controllers
                // yet. This is where we will explicitly add the PT maps to
                // dGPU memory in the future.
                //
                // Bind the VA space to the dGPU physical memory pool. Mark
                // this region as Uncacheable. The Uncacheable flag is only
                // really used by the CPU and is ignored by the GPU. We
                // mark this as uncacheable from the CPU so that we can
                // implement direct CPU framebuffer access similar to what
                // we currently offer in real HW through the so-called
                // Large BAR feature.
                pa_addr = process->seWorkload->allocPhysPages(
                        npages, dGPUPoolID);
                //
                // TODO: Uncacheable accesses need to be supported by the
                // CPU-side protocol for this to work correctly. I believe
                // it only works right now if the physical memory is MMIO.
                cacheable = false;

                DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p "
                        "size %d\n", args->va_addr, pa_addr, args->size);

            } else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n");
                mmap_offset = args->mmap_offset;
                // USERPTR allocations are system memory mapped into GPUVM
                // space. The user provides the driver with the pointer.
                pa_addr = process->seWorkload->allocPhysPages(npages);

                DPRINTF(GPUDriver, "Mapping VA %p to host PA %p size "
                        "%d\n", args->va_addr, pa_addr, args->size);

                // If the HSA runtime requests system coherent memory, then
                // we need to explicitly mark this region as uncacheable
                // from the perspective of the GPU.
                if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
                    mtype.clear();

            } else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n");
                args->mmap_offset = args->va_addr;
                // GTT allocations are system memory mapped into GPUVM
                // space. They differ from USERPTR allocations in that the
                // driver itself allocates the physical memory on the host.
                //
                // We will lazily map it into host memory on first touch.
                // fixupFault will find the original SVM aperture mapped to
                // the host.
                pa_addr = process->seWorkload->allocPhysPages(npages);

                DPRINTF(GPUDriver, "Mapping VA %p to host PA %p size "
                        "%d\n", args->va_addr, pa_addr, args->size);

                // If the HSA runtime requests system coherent memory, then
                // we need to explicitly mark this region as uncacheable
                // from the perspective of the GPU.
                if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
                    mtype.clear();

                // Note that for GTT the thunk layer needs to call mmap on
                // the driver FD later if it wants the host to have access
                // to this memory (which it probably does). That mmap will
                // be ignored.
            } else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n");
                // DOORBELL allocations are the queue doorbells that are
                // memory mapped into GPUVM space.
                //
                // Explicitly map this virtual address to our PIO doorbell
                // interface in the page tables (non-cacheable).
                pa_addr = device->hsaPacketProc().pioAddr;
                cacheable = false;
            }

            DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p "
                    "size %lu, mmap_offset %p, gpu_id %d\n",
                    args->va_addr, args->size, mmap_offset, args->gpu_id);

            // Bind the selected physical memory to the provided virtual
            // address range in the X86 page tables.
            process->pTable->map(args->va_addr, pa_addr, args->size,
                                 cacheable);

            // We keep track of allocated regions of GPU mapped memory,
            // just like the driver would. This allows us to provide the
            // user with a unique handle for a given allocation. The user
            // will only provide us with a handle after allocation and
            // expect us to be able to use said handle to extract all the
            // properties of the region.
            //
            // This is a simplified version of regular system VMAs, but
            // for GPUVM space (none of the clobber/remap nonsense we find
            // in real OS managed memory).
            allocateGpuVma(mtype, args->va_addr, args->size);

            // Used by the runtime to uniquely identify this allocation.
            // We can just use the starting address of the VMA region.
            args->handle = args->va_addr;
            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
            TypedBufferArg<kfd_ioctl_free_memory_of_gpu_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            assert(isdGPU);
            DPRINTF(GPUDriver, "amdkfd free arguments: handle %p\n",
                    args->handle);

            // We don't recycle physical pages in SE mode
            Addr size = deallocateGpuVma(args->handle);
            process->pTable->unmap(args->handle, size);

            // TODO: IOMMU and GPUTLBs do not seem to correctly support
            // shootdown. This is also a potential issue for APU systems
            // that perform unmap or remap with system memory.
            tc->getMMUPtr()->flushAll();

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
        }
        break;
      case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
        }
        break;
      case AMDKFD_IOC_SET_CU_MASK:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
        }
        break;
      case AMDKFD_IOC_GET_QUEUE_WAVE_STATE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_QUEUE_WAVE_STATE\n");
        }
        break;
      case AMDKFD_IOC_GET_DMABUF_INFO:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
        }
        break;
      case AMDKFD_IOC_IMPORT_DMABUF:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
        }
        break;
      case AMDKFD_IOC_ALLOC_QUEUE_GWS:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_QUEUE_GWS\n");
        }
        break;
      case AMDKFD_IOC_SMI_EVENTS:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SMI_EVENTS\n");
        }
        break;
      default:
        fatal("%s: bad ioctl %d\n", filename, req);
        break;
    }
    return 0;
}

void
GPUComputeDriver::sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout)
{
    // Convert milliseconds to ticks: gem5 ticks are picoseconds by
    // default, so one millisecond is 10^9 ticks.
    Tick wakeup_delay((uint64_t)milliSecTimeout * 1000000000);
    assert(TCEvents.count(tc) == 1);
    TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay);
    tc->suspend();
    DPRINTF(GPUDriver,
            "CPU %d is put to sleep\n", tc->cpuId());
}

Addr
GPUComputeDriver::gpuVmApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x1000000000000L;
}

Addr
GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
}

Addr
GPUComputeDriver::scratchApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x100000000L;
}

// Used for GFX9 devices
// From drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c in the Linux kernel
Addr
GPUComputeDriver::scratchApeBaseV9() const
{
    return ((Addr)0x1 << 48);
}

Addr
GPUComputeDriver::scratchApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}

Addr
GPUComputeDriver::ldsApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x0;
}

// Used for GFX9 devices
// From drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c in the Linux kernel
Addr
GPUComputeDriver::ldsApeBaseV9() const
{
    return ((Addr)0x2 << 48);
}

Addr
GPUComputeDriver::ldsApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}
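
// The per-device apertures above carve the flat VA space up by placing
// the device number in the top bits (gpuNum << 61); the V9 variants
// instead return the fixed GFX9 aperture bases at 1 << 48 and 2 << 48.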
960
void
GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype,
                                 Addr start, Addr length)
{
    AddrRange range = AddrRange(start, start + length);
    DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n",
            range.start(), range.end(), mtype);
    fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(),
             "Attempted to double register Mtypes for [%p - %p]\n",
             range.start(), range.end());
}

Addr
GPUComputeDriver::deallocateGpuVma(Addr start)
{
    auto vma = gpuVmas.contains(start);
    assert(vma != gpuVmas.end());
    assert((vma->first.start() == start));
    Addr size = vma->first.size();
    DPRINTF(GPUDriver, "Unregistering [%p - %p]\n", vma->first.start(),
            vma->first.end());
    gpuVmas.erase(vma);
    return size;
}

void
GPUComputeDriver::setMtype(RequestPtr req)
{
    // If we are a dGPU then set the MTYPE from our VMAs.
    if (isdGPU) {
        assert(!FullSystem);
        AddrRange range = RangeSize(req->getVaddr(), req->getSize());
        auto vma = gpuVmas.contains(range);
        assert(vma != gpuVmas.end());
        DPRINTF(GPUShader, "Setting req from [%p - %p] to MTYPE %d\n",
                range.start(), range.end(), vma->second);
        req->setCacheCoherenceFlags(vma->second);
    // APUs always get the default MTYPE
    } else {
        req->setCacheCoherenceFlags(defaultMtype);
    }
}

} // namespace gem5