gem5 v23.0.0.1
gpu_compute_driver.cc
/*
 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_compute_driver.hh"

#include <memory>

#include "arch/x86/page_size.hh"
#include "base/compiler.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "cpu/thread_context.hh"
#include "debug/GPUDriver.hh"
#include "debug/GPUShader.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "dev/hsa/kfd_event_defines.h"
#include "dev/hsa/kfd_ioctl.h"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/shader.hh"
#include "mem/port_proxy.hh"
#include "mem/se_translating_port_proxy.hh"
#include "mem/translating_port_proxy.hh"
#include "params/GPUComputeDriver.hh"
#include "sim/full_system.hh"
#include "sim/process.hh"
#include "sim/se_workload.hh"
#include "sim/syscall_emul_buf.hh"

namespace gem5
{

GPUComputeDriver::GPUComputeDriver(const Params &p)
    : EmulatedDriver(p), device(p.device), queueId(0),
      isdGPU(p.isdGPU), gfxVersion(p.gfxVersion), dGPUPoolID(p.dGPUPoolID),
      eventPage(0), eventSlotIndex(0)
{
    device->attachDriver(this);
    DPRINTF(GPUDriver, "Constructing KFD: device\n");

    // Convert the 3-bit mtype specified in Shader.py to the proper flags
    // used for requests.
    std::bitset<MtypeFlags::NUM_MTYPE_BITS> mtype(p.m_type);
    if (mtype.test(MtypeFlags::SHARED)) {
        defaultMtype.set(Request::SHARED);
    }

    if (mtype.test(MtypeFlags::READ_WRITE)) {
        defaultMtype.set(Request::READ_WRITE);
    }

    if (mtype.test(MtypeFlags::CACHED)) {
        defaultMtype.set(Request::CACHED);
    }
}
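
// The defaultMtype flags assembled above serve two roles later in this file:
// APU requests receive them verbatim in setMtype(), while dGPU allocations
// start from them in AMDKFD_IOC_ALLOC_MEMORY_OF_GPU (where COHERENT requests
// clear the flags again).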

const char*
GPUComputeDriver::DriverWakeupEvent::description() const
{
    return "DriverWakeupEvent";
}

/**
 * Create an FD entry for the KFD inside of the owning process.
 */
int
GPUComputeDriver::open(ThreadContext *tc, int mode, int flags)
{
    DPRINTF(GPUDriver, "Opened %s\n", filename);
    auto process = tc->getProcessPtr();
    auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
    int tgt_fd = process->fds->allocFD(device_fd_entry);
    return tgt_fd;
}

/**
 * Currently, mmap() will simply setup a mapping for the associated
 * device's packet processor's doorbells and creates the event page.
 */
Addr
GPUComputeDriver::mmap(ThreadContext *tc, Addr start, uint64_t length,
                       int prot, int tgt_flags, int tgt_fd, off_t offset)
{
    auto process = tc->getProcessPtr();
    auto mem_state = process->memState;

    Addr pg_off = offset >> PAGE_SHIFT;
    Addr mmap_type = pg_off & KFD_MMAP_TYPE_MASK;
    DPRINTF(GPUDriver, "amdkfd mmap (start: %p, length: 0x%x, "
            "offset: 0x%x)\n", start, length, offset);

    switch (mmap_type) {
      case KFD_MMAP_TYPE_DOORBELL:
        DPRINTF(GPUDriver, "amdkfd mmap type DOORBELL offset\n");
        start = mem_state->extendMmap(length);
        process->pTable->map(start, device->hsaPacketProc().pioAddr,
                             length, false);
        break;
      case KFD_MMAP_TYPE_EVENTS:
        DPRINTF(GPUDriver, "amdkfd mmap type EVENTS offset\n");
        panic_if(start != 0,
                 "Start address should be provided by KFD\n");
        panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
                 "Requested length %d, expected length %d; length "
                 "mismatch\n", length, 8 * KFD_SIGNAL_EVENT_LIMIT);
        // Reserve a VA region for the event page on first use; the slots
        // handed out by AMDKFD_IOC_CREATE_EVENT live in this page.
        if (!eventPage) {
            eventPage = mem_state->extendMmap(length);
            start = eventPage;
        }
        break;
      default:
        warn_once("Unrecognized kfd mmap type %llx\n", mmap_type);
        break;
    }

    return start;
}
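
// Note: each signal slot in the event page is 8 bytes, which is why the
// EVENTS mmap above insists on a length of exactly
// 8 * KFD_SIGNAL_EVENT_LIMIT. AMDKFD_IOC_CREATE_EVENT below hands out slot
// indices into this page.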

/**
 * Forward relevant parameters to packet processor; queueId is used to
 * link the doorbell. The queueIDs are not re-used in the current
 * implementation, and we allocate only one page (4096 bytes) for
 * doorbells, so check if this queue ID can be mapped into that page.
 */
void
GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr)
{
    TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf_addr);
    args.copyIn(mem_proxy);

    if ((doorbellSize() * queueId) > 4096) {
        fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
    }

    args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL |
        KFD_MMAP_GPU_ID(args->gpu_id)) << PAGE_SHIFT;

    // For Vega, the offset needs to include the exact doorbell location
    if (doorbellSize())
        args->doorbell_offset += queueId * doorbellSize();

    args->queue_id = queueId++;
    auto &hsa_pp = device->hsaPacketProc();
    hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
                              args->ring_base_address, args->queue_id,
                              args->ring_size, doorbellSize(), gfxVersion);
    args.copyOut(mem_proxy);
}
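
// Round trip of doorbell_offset (assuming 4 KiB pages, PAGE_SHIFT == 12):
// the KFD_MMAP_TYPE_DOORBELL and KFD_MMAP_GPU_ID bits are packed above
// PAGE_SHIFT, so when the runtime hands the offset back to mmap(), the
// (offset >> PAGE_SHIFT) & KFD_MMAP_TYPE_MASK test above recognizes it and
// maps the packet processor's doorbell PIO range.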

void
GPUComputeDriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
{
    assert(driver);
    driver->schedule(this, curTick() + wakeup_delay);
}

void
GPUComputeDriver::signalWakeupEvent(uint32_t event_id)
{
    panic_if(event_id >= eventSlotIndex,
             "Trying wakeup on an event that is not yet created\n");
    if (ETable[event_id].threadWaiting) {
        panic_if(!ETable[event_id].tc,
                 "No thread context to wake up\n");
        ThreadContext *tc = ETable[event_id].tc;
        DPRINTF(GPUDriver,
                "Signal event: Waking up CPU %d\n", tc->cpuId());
        // Remove events that can wake up this thread
        TCEvents[tc].clearEvents();
        // Now wake up this thread
        tc->activate();
    } else {
        // There may be a race between an ioctl call asking to wait on
        // this event and this signalWakeupEvent. We handle that race by
        // setting the event here; the ioctl should take the necessary
        // action when waiting on an already-set event. This may also be a
        // genuine case in which the runtime decided not to wait on this
        // event, but since we cannot distinguish that case from the race,
        // we set the event anyway.
        ETable[event_id].setEvent = true;
    }
}

void
GPUComputeDriver::DriverWakeupEvent::process()
{
    DPRINTF(GPUDriver,
            "Timer event: Waking up CPU %d\n", tc->cpuId());
    // Remove events that can wake up this thread
    driver->TCEvents[tc].clearEvents();
    // Now wake up this thread
    tc->activate();
}

int
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
{
    TranslatingPortProxy fs_proxy(tc);
    SETranslatingPortProxy se_proxy(tc);
    PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
    auto process = tc->getProcessPtr();
    auto mem_state = process->memState;

    switch (req) {
      case AMDKFD_IOC_GET_VERSION:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");

            TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
            args->major_version = KFD_IOCTL_MAJOR_VERSION;
            args->minor_version = KFD_IOCTL_MINOR_VERSION;

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_CREATE_QUEUE:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");

            allocateQueue(virt_proxy, ioc_buf);

            DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
        }
        break;
      case AMDKFD_IOC_DESTROY_QUEUE:
        {
            TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE; "
                    "queue id %d\n", args->queue_id);
            device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id,
                                                         doorbellSize());
        }
        break;
      case AMDKFD_IOC_SET_MEMORY_POLICY:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
        }
        break;
      case AMDKFD_IOC_GET_CLOCK_COUNTERS:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");

            TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            // Set nanosecond resolution
            args->system_clock_freq = 1000000000;

            // Derive all clock counters from the current tick; in this
            // model the GPU, CPU, and system counters are identical and
            // perfectly in sync.
            uint64_t elapsed_nsec = curTick() / sim_clock::as_int::ns;
            args->gpu_clock_counter = elapsed_nsec;
            args->cpu_clock_counter = elapsed_nsec;
            args->system_clock_counter = elapsed_nsec;

            args.copyOut(virt_proxy);
        }
        break;
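
      // Worked example: gem5's default tick rate is 1 THz (1 tick = 1 ps),
      // so sim_clock::as_int::ns is 1000 ticks. After 1 ms of simulated
      // time (10^9 ticks), all three counters above read 10^6, i.e. 1 ms
      // at the advertised 1 GHz (nanosecond-resolution) clock.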
      case AMDKFD_IOC_GET_PROCESS_APERTURES:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_args>
                args(ioc_buf);
            args->num_of_nodes = 1;

            // Set the GPUVM/LDS/scratch apertures the same way the real
            // KFD does; see drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
            // in the Linux kernel.
            for (int i = 0; i < args->num_of_nodes; ++i) {
                // GFX8 and GFX9 devices set their aperture bases
                // differently.
                switch (gfxVersion) {
                  case GfxVersion::gfx801:
                  case GfxVersion::gfx803:
                    args->process_apertures[i].scratch_base =
                        scratchApeBase(i + 1);
                    args->process_apertures[i].lds_base =
                        ldsApeBase(i + 1);
                    break;
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    args->process_apertures[i].scratch_base =
                        scratchApeBaseV9();
                    args->process_apertures[i].lds_base =
                        ldsApeBaseV9();
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // GFX8 and GFX9 set lds and scratch limits the same way
                args->process_apertures[i].scratch_limit =
                    scratchApeLimit(args->process_apertures[i].scratch_base);

                args->process_apertures[i].lds_limit =
                    ldsApeLimit(args->process_apertures[i].lds_base);

                switch (gfxVersion) {
                  case GfxVersion::gfx801:
                    args->process_apertures[i].gpuvm_base =
                        gpuVmApeBase(i + 1);
                    args->process_apertures[i].gpuvm_limit =
                        gpuVmApeLimit(args->process_apertures[i].gpuvm_base);
                    break;
                  case GfxVersion::gfx803:
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    // Taken from SVM_USE_BASE in the Linux kernel
                    args->process_apertures[i].gpuvm_base = 0x1000000ull;
                    // Taken from AMDGPU_GMC_HOLE_START in the Linux kernel
                    args->process_apertures[i].gpuvm_limit =
                        0x0000800000000000ULL - 1;
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // NOTE: Must match ID populated by hsaTopology.py
                //
                // https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/
                // blob/6a986c0943e9acd8c4c0cf2a9d510ff42167b43f/include/uapi/
                // linux/kfd_ioctl.h#L564
                //
                // The gpu_id is a device identifier used by the driver for
                // ioctls that allocate arguments. Each device has a unique
                // id composed out of a non-zero base and an offset.
                if (isdGPU) {
                    switch (gfxVersion) {
                      case GfxVersion::gfx803:
                        args->process_apertures[i].gpu_id = 50156;
                        break;
                      case GfxVersion::gfx900:
                        args->process_apertures[i].gpu_id = 22124;
                        break;
                      default:
                        fatal("Invalid gfx version for dGPU\n");
                    }
                } else {
                    switch (gfxVersion) {
                      case GfxVersion::gfx801:
                      case GfxVersion::gfx902:
                        args->process_apertures[i].gpu_id = 2765;
                        break;
                      default:
                        fatal("Invalid gfx version for APU\n");
                    }
                }

                DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_base);
                DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_limit);

                DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_base);
                DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_limit);

                DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_base);
                DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_limit);

                // The apertures must lie in the non-canonical hole of the
                // x86-64 VA space: bits 63:47 may be neither all ones nor
                // all zeros.
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0);
            }

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_UPDATE_QUEUE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
        }
        break;
      case AMDKFD_IOC_CREATE_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_EVENT\n");

            TypedBufferArg<kfd_ioctl_create_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            if (args->event_type != KFD_IOC_EVENT_SIGNAL) {
                warn("Currently only signal events are supported\n");
            } else if (eventSlotIndex == SLOTS_PER_PAGE) {
                fatal("Signal event wasn't created; signal limit reached\n");
            }
            // Currently, we allocate only one signal page for events.
            // Note that this signal page is of size
            // 8 * KFD_SIGNAL_EVENT_LIMIT.
            uint64_t page_index = 0;
            args->event_page_offset = (page_index | KFD_MMAP_TYPE_EVENTS);
            args->event_page_offset <<= PAGE_SHIFT;
            // TODO: Currently we support only signal events, hence using
            // the same ID for both signal slot and event slot
            args->event_slot_index = eventSlotIndex;
            args->event_id = eventSlotIndex++;
            args->event_trigger_data = args->event_id;
            DPRINTF(GPUDriver, "amdkfd create events "
                    "(event_id: 0x%x, offset: 0x%x)\n",
                    args->event_id, args->event_page_offset);
            // Since eventSlotIndex is increased every time a new event is
            // created, the ETable entry at eventSlotIndex (event_id) is
            // guaranteed to be empty. A future implementation that reuses
            // deleted event_ids should check that the event table entry at
            // this eventSlotIndex (event_id) is empty before inserting a
            // new one.
            ETable.emplace(std::pair<uint32_t, ETEntry>(args->event_id, {}));
            args.copyOut(virt_proxy);
        }
        break;
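
      // The event_page_offset returned above round-trips like the doorbell
      // offset: KFD_MMAP_TYPE_EVENTS sits above PAGE_SHIFT, so a later
      // mmap() of this offset reserves (or returns) the event page handled
      // in GPUComputeDriver::mmap().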
      case AMDKFD_IOC_DESTROY_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
            TypedBufferArg<kfd_ioctl_destroy_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "amdkfd destroying event %d\n",
                    args->event_id);
            fatal_if(ETable.count(args->event_id) == 0,
                     "Event ID invalid, cannot destroy this event\n");
            ETable.erase(args->event_id);
        }
        break;
      case AMDKFD_IOC_SET_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_SET_EVENT\n");
            TypedBufferArg<kfd_ioctl_set_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "amdkfd set event %d\n", args->event_id);
            fatal_if(ETable.count(args->event_id) == 0,
                     "Event ID invalid, cannot set this event\n");
            ETable[args->event_id].setEvent = true;
            signalWakeupEvent(args->event_id);
        }
        break;
      case AMDKFD_IOC_RESET_EVENT:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
        }
        break;
      case AMDKFD_IOC_WAIT_EVENTS:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
            TypedBufferArg<kfd_ioctl_wait_events_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            kfd_event_data *events =
                (kfd_event_data *)args->events_ptr;
            DPRINTF(GPUDriver, "amdkfd wait for events "
                    "(wait on all: %d, timeout: %d, num_events: %d)\n",
                    args->wait_for_all, args->timeout, args->num_events);
            panic_if(args->wait_for_all != 0 && args->num_events > 1,
                     "Wait for all events not supported\n");
            bool should_sleep = true;
            if (TCEvents.count(tc) == 0) {
                // This thread context is trying to wait on an event for
                // the first time, initialize it.
                TCEvents.emplace(std::piecewise_construct,
                                 std::make_tuple(tc),
                                 std::make_tuple(this, tc));
                DPRINTF(GPUDriver, "\tamdkfd creating event list"
                        " for thread %d\n", tc->cpuId());
            }
            panic_if(TCEvents[tc].signalEvents.size() != 0,
                     "There are %d events that put this thread to sleep,"
                     " this thread should not be running\n",
                     TCEvents[tc].signalEvents.size());
            for (int i = 0; i < args->num_events; i++) {
                panic_if(!events,
                         "Event pointer invalid\n");
                Addr eventDataAddr = (Addr)(events + i);
                TypedBufferArg<kfd_event_data> EventData(
                    eventDataAddr, sizeof(kfd_event_data));
                EventData.copyIn(virt_proxy);
                DPRINTF(GPUDriver,
                        "\tamdkfd wait for event %d\n", EventData->event_id);
                panic_if(ETable.count(EventData->event_id) == 0,
                         "Event ID invalid, cannot wait on this event\n");
                if (ETable[EventData->event_id].threadWaiting)
                    warn("Multiple threads waiting on the same event\n");
                if (ETable[EventData->event_id].setEvent) {
                    // The event is already set, i.e. it has already
                    // happened. Just unset the event and don't put this
                    // thread to sleep.
                    ETable[EventData->event_id].setEvent = false;
                    should_sleep = false;
                }
                if (should_sleep) {
                    // Put this thread to sleep
                    ETable[EventData->event_id].threadWaiting = true;
                    ETable[EventData->event_id].tc = tc;
                    TCEvents[tc].signalEvents.insert(EventData->event_id);
                }
            }

            // TODO: Return the correct wait_result back. Currently, returning
            // success for both KFD_WAIT_TIMEOUT and KFD_WAIT_COMPLETE.
            // Ideally, this needs to be done after the event is triggered and
            // after the thread is woken up.
            args->wait_result = 0;
            args.copyOut(virt_proxy);
            if (should_sleep) {
                // Put this thread to sleep
                sleepCPU(tc, args->timeout);
            } else {
                // Remove events that tried to put this thread to sleep
                TCEvents[tc].clearEvents();
            }
        }
        break;
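
      // Wake-up protocol: either signalWakeupEvent() (via
      // AMDKFD_IOC_SET_EVENT) or the DriverWakeupEvent timer fires first;
      // both clear this thread's signalEvents list before re-activating the
      // context, so a thread never wakes with stale wait entries.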
      case AMDKFD_IOC_DBG_REGISTER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
        }
        break;
      case AMDKFD_IOC_DBG_UNREGISTER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
        }
        break;
      case AMDKFD_IOC_DBG_ADDRESS_WATCH:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
        }
        break;
      case AMDKFD_IOC_DBG_WAVE_CONTROL:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
        }
        break;
      case AMDKFD_IOC_SET_SCRATCH_BACKING_VA:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_SCRATCH_BACKING_VA\n");
        }
        break;
      case AMDKFD_IOC_GET_TILE_CONFIG:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
        }
        break;
      case AMDKFD_IOC_SET_TRAP_HANDLER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
        }
        break;
      case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
        {
            DPRINTF(GPUDriver,
                    "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_new_args>
                ioc_args(ioc_buf);

            ioc_args.copyIn(virt_proxy);
            ioc_args->num_of_nodes = 1;

            for (int i = 0; i < ioc_args->num_of_nodes; ++i) {
                TypedBufferArg<kfd_process_device_apertures> ape_args
                    (ioc_args->kfd_process_device_apertures_ptr);

                switch (gfxVersion) {
                  case GfxVersion::gfx801:
                  case GfxVersion::gfx803:
                    ape_args->scratch_base = scratchApeBase(i + 1);
                    ape_args->lds_base = ldsApeBase(i + 1);
                    break;
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    ape_args->scratch_base = scratchApeBaseV9();
                    ape_args->lds_base = ldsApeBaseV9();
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // GFX8 and GFX9 set lds and scratch limits the same way
                ape_args->scratch_limit =
                    scratchApeLimit(ape_args->scratch_base);
                ape_args->lds_limit = ldsApeLimit(ape_args->lds_base);

                switch (gfxVersion) {
                  case GfxVersion::gfx801:
                    ape_args->gpuvm_base = gpuVmApeBase(i + 1);
                    ape_args->gpuvm_limit =
                        gpuVmApeLimit(ape_args->gpuvm_base);
                    break;
                  case GfxVersion::gfx803:
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    // Taken from SVM_USE_BASE in the Linux kernel
                    ape_args->gpuvm_base = 0x1000000ull;
                    // Taken from AMDGPU_GMC_HOLE_START in the Linux kernel
                    ape_args->gpuvm_limit = 0x0000800000000000ULL - 1;
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // NOTE: Must match ID populated by hsaTopology.py
                if (isdGPU) {
                    switch (gfxVersion) {
                      case GfxVersion::gfx803:
                        ape_args->gpu_id = 50156;
                        break;
                      case GfxVersion::gfx900:
                        ape_args->gpu_id = 22124;
                        break;
                      default:
                        fatal("Invalid gfx version for dGPU\n");
                    }
                } else {
                    switch (gfxVersion) {
                      case GfxVersion::gfx801:
                      case GfxVersion::gfx902:
                        ape_args->gpu_id = 2765;
                        break;
                      default:
                        fatal("Invalid gfx version for APU\n");
                    }
                }

                // The apertures must lie in the non-canonical hole of the
                // x86-64 VA space: bits 63:47 may be neither all ones nor
                // all zeros.
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);

                ape_args.copyOut(virt_proxy);
            }

            ioc_args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_ACQUIRE_VM:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_ACQUIRE_VM\n");
        }
        break;
      // On real hardware, this ioctl is what carves BOs/VMAs out of the
      // shared SVM region and binds them to physical memory or doorbells.
      // gem5's SE mode uses the host page tables on the GPU directly, so
      // here we only need to pick the physical backing and install the
      // mapping. Each memory type is commented on separately below.
      case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
            TypedBufferArg<kfd_ioctl_alloc_memory_of_gpu_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            assert(isdGPU || gfxVersion == GfxVersion::gfx902);
            assert((args->va_addr % X86ISA::PageBytes) == 0);
            [[maybe_unused]] Addr mmap_offset = 0;

            Request::CacheCoherenceFlags mtype = defaultMtype;
            Addr pa_addr = 0;

            int npages = divCeil(args->size, (int64_t)X86ISA::PageBytes);
            bool cacheable = true;

            if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n");
                args->mmap_offset = args->va_addr;
                // VRAM allocations are device memory mapped into GPUVM
                // space.
                //
                // We can't rely on the lazy host allocator (fixupFault) to
                // handle this mapping since it needs to be placed in dGPU
                // framebuffer memory. The lazy allocator will try to place
                // this in host memory.
                //
                // TODO: We don't have the appropriate bifurcation of the
                // physical address space with different memory controllers
                // yet. This is where we will explicitly add the PT maps to
                // dGPU memory in the future.
                //
                // Bind the VA space to the dGPU physical memory pool. Mark
                // this region as Uncacheable. The Uncacheable flag is only
                // really used by the CPU and is ignored by the GPU. We mark
                // this as uncacheable from the CPU so that we can implement
                // direct CPU framebuffer access similar to what we currently
                // offer in real HW through the so-called Large BAR feature.
                pa_addr = process->seWorkload->allocPhysPages(
                        npages, dGPUPoolID);
                //
                // TODO: Uncacheable accesses need to be supported by the
                // CPU-side protocol for this to work correctly. I believe
                // it only works right now if the physical memory is MMIO.
                cacheable = false;

                DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p size "
                        "%d\n", args->va_addr, pa_addr, args->size);

            } else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n");
                mmap_offset = args->mmap_offset;
                // USERPTR allocations are system memory mapped into GPUVM
                // space. The user provides the driver with the pointer.
                pa_addr = process->seWorkload->allocPhysPages(npages);

                DPRINTF(GPUDriver, "Mapping VA %p to host PA %p size "
                        "%d\n", args->va_addr, pa_addr, args->size);

                // If the HSA runtime requests system coherent memory, then
                // we need to explicitly mark this region as uncacheable
                // from the perspective of the GPU.
                if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
                    mtype.clear();

            } else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n");
                args->mmap_offset = args->va_addr;
                // GTT allocations are system memory mapped into GPUVM space.
                // It's different from a USERPTR allocation since the driver
                // itself allocates the physical memory on the host.
                //
                // We will lazily map it into host memory on first touch. The
                // fixupFault will find the original SVM aperture mapped to
                // the host.
                pa_addr = process->seWorkload->allocPhysPages(npages);

                DPRINTF(GPUDriver, "Mapping VA %p to host PA %p size "
                        "%d\n", args->va_addr, pa_addr, args->size);

                // If the HSA runtime requests system coherent memory, then
                // we need to explicitly mark this region as uncacheable
                // from the perspective of the GPU.
                if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
                    mtype.clear();

                // Note that for GTT the thunk layer needs to call mmap on the
                // driver FD later if it wants the host to have access to this
                // memory (which it probably does). This will be ignored.
            } else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n");
                // DOORBELL allocations are the queue doorbells that are
                // memory mapped into GPUVM space.
                //
                // Explicitly map this virtual address to our PIO doorbell
                // interface in the page tables (non-cacheable).
                pa_addr = device->hsaPacketProc().pioAddr;
                cacheable = false;
            }

            DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p "
                    "size %lu, mmap_offset %p, gpu_id %d\n",
                    args->va_addr, args->size, mmap_offset, args->gpu_id);

            // Bind the selected physical memory to the provided virtual
            // address range in the x86 page tables.
            process->pTable->map(args->va_addr, pa_addr, args->size,
                                 cacheable);

            // We keep track of allocated regions of GPU mapped memory,
            // just like the driver would. This allows us to provide the
            // user with a unique handle for a given allocation. The user
            // will only provide us with a handle after allocation and expect
            // us to be able to use said handle to extract all the properties
            // of the region.
            //
            // This is a simplified version of regular system VMAs, but for
            // GPUVM space (none of the clobber/remap nonsense we find in
            // real OS-managed memory).
            allocateGpuVma(mtype, args->va_addr, args->size);

            // Used by the runtime to uniquely identify this allocation.
            // We can just use the starting address of the VMA region.
            args->handle = args->va_addr;
            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
            TypedBufferArg<kfd_ioctl_free_memory_of_gpu_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            assert(isdGPU);
            DPRINTF(GPUDriver, "amdkfd free arguments: handle %p\n",
                    args->handle);

            // We don't recycle physical pages in SE mode
            Addr size = deallocateGpuVma(args->handle);
            process->pTable->unmap(args->handle, size);

            // TODO: The IOMMU and GPU TLBs do not seem to correctly support
            // shootdown. This is also a potential issue for APU systems
            // that perform unmap or remap with system memory.
            tc->getMMUPtr()->flushAll();

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
        }
        break;
      case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
        }
        break;
      case AMDKFD_IOC_SET_CU_MASK:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
        }
        break;
      case AMDKFD_IOC_GET_QUEUE_WAVE_STATE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_QUEUE_WAVE_STATE\n");
        }
        break;
      case AMDKFD_IOC_GET_DMABUF_INFO:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
        }
        break;
      case AMDKFD_IOC_IMPORT_DMABUF:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
        }
        break;
      case AMDKFD_IOC_ALLOC_QUEUE_GWS:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_QUEUE_GWS\n");
        }
        break;
      case AMDKFD_IOC_SMI_EVENTS:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SMI_EVENTS\n");
        }
        break;
      default:
        fatal("%s: bad ioctl %d\n", name(), req);
        break;
    }
    return 0;
}

void
GPUComputeDriver::sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout)
{
    // Convert milliseconds to ticks
    Tick wakeup_delay((uint64_t)milliSecTimeout * 1000000000);
    assert(TCEvents.count(tc) == 1);
    TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay);
    tc->suspend();
    DPRINTF(GPUDriver,
            "CPU %d is put to sleep\n", tc->cpuId());
}
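
// e.g., a 10 ms timeout becomes 10 * 10^9 = 10^10 ticks: the multiplier
// above converts milliseconds to picosecond ticks at gem5's default 1 THz
// tick rate.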

/**
 * The aperture (APE) base/limit pairs are set statically at startup by
 * the real KFD.
 */
Addr
GPUComputeDriver::gpuVmApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x1000000000000L;
}

Addr
GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
}
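
// Worked example: for gpuNum == 1, gpuVmApeBase() returns
// 0x2001000000000000, and gpuVmApeLimit() fills in the low 40 bits to give
// 0x200100ffffffffff, i.e. a 1 TiB GPUVM aperture.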

Addr
GPUComputeDriver::scratchApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x100000000L;
}

// Used for GFX9 devices
// From drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c in the Linux kernel
Addr
GPUComputeDriver::scratchApeBaseV9() const
{
    return ((Addr)0x1 << 48);
}

Addr
GPUComputeDriver::scratchApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}
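
// Worked example: on GFX9, scratchApeBaseV9() returns 0x0001000000000000
// and scratchApeLimit() fills the low 32 bits, yielding a 4 GiB scratch
// aperture that ends at 0x00010000ffffffff.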

Addr
GPUComputeDriver::ldsApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x0;
}

// Used for GFX9 devices
// From drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c in the Linux kernel
Addr
GPUComputeDriver::ldsApeBaseV9() const
{
    return ((Addr)0x2 << 48);
}

Addr
GPUComputeDriver::ldsApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}
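
// Worked example: the GFX9 LDS aperture sits one 2^48 slot above scratch:
// ldsApeBaseV9() returns 0x0002000000000000, with limit 0x00020000ffffffff.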

/**
 * Allocate/deallocate GPUVM VMAs for tracking virtual address allocations
 * and properties on dGPUs.
 */
void
GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype,
                                 Addr start, Addr length)
{
    AddrRange range = AddrRange(start, start + length);
    DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n",
            range.start(), range.end(), mtype);
    fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(),
             "Attempted to double register Mtypes for [%p - %p]\n",
             range.start(), range.end());
}

Addr
GPUComputeDriver::deallocateGpuVma(Addr start)
{
    auto vma = gpuVmas.contains(start);
    assert(vma != gpuVmas.end());
    assert((vma->first.start() == start));
    Addr size = vma->first.size();
    DPRINTF(GPUDriver, "Unregistering [%p - %p]\n", vma->first.start(),
            vma->first.end());
    gpuVmas.erase(vma);
    return size;
}

/**
 * Called by the compute units right before a request is issued to Ruby.
 */
void
GPUComputeDriver::setMtype(RequestPtr req)
{
    // If we are a dGPU then set the MTYPE from our VMAs.
    if (isdGPU) {
        assert(!FullSystem);
        AddrRange range = RangeSize(req->getVaddr(), req->getSize());
        auto vma = gpuVmas.contains(range);
        assert(vma != gpuVmas.end());
        DPRINTF(GPUShader, "Setting req from [%p - %p] to MTYPE %d\n",
                range.start(), range.end(), vma->second);
        req->setCacheCoherenceFlags(vma->second);
    // APUs always get the default MTYPE
    } else {
        req->setCacheCoherenceFlags(defaultMtype);
    }
}

} // namespace gem5