gem5  v22.1.0.0
gpu_compute_driver.cc
/*
 * Copyright (c) 2015-2018 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "gpu-compute/gpu_compute_driver.hh"

#include <memory>

#include "arch/x86/page_size.hh"
#include "base/compiler.hh"
#include "base/logging.hh"
#include "base/trace.hh"
#include "cpu/thread_context.hh"
#include "debug/GPUDriver.hh"
#include "debug/GPUShader.hh"
#include "dev/hsa/hsa_packet_processor.hh"
#include "dev/hsa/kfd_event_defines.h"
#include "dev/hsa/kfd_ioctl.h"
#include "gpu-compute/gpu_command_processor.hh"
#include "gpu-compute/shader.hh"
#include "mem/port_proxy.hh"
#include "mem/se_translating_port_proxy.hh"
#include "mem/translating_port_proxy.hh"
#include "params/GPUComputeDriver.hh"
#include "sim/full_system.hh"
#include "sim/process.hh"
#include "sim/se_workload.hh"
#include "sim/syscall_emul_buf.hh"

namespace gem5
{

GPUComputeDriver::GPUComputeDriver(const Params &p)
    : EmulatedDriver(p), device(p.device), queueId(0),
      isdGPU(p.isdGPU), gfxVersion(p.gfxVersion), dGPUPoolID(p.dGPUPoolID),
      eventPage(0), eventSlotIndex(0)
{
    device->attachDriver(this);
    DPRINTF(GPUDriver, "Constructing KFD: device\n");

    // Convert the 3-bit mtype specified in Shader.py to the proper type
    // used for requests.
    std::bitset<MtypeFlags::NUM_MTYPE_BITS> mtype(p.m_type);
    if (mtype.test(MtypeFlags::SHARED)) {
        defaultMtype.set(Request::SHARED);
    }

    if (mtype.test(MtypeFlags::READ_WRITE)) {
        defaultMtype.set(Request::READ_WRITE);
    }

    if (mtype.test(MtypeFlags::CACHED)) {
        defaultMtype.set(Request::CACHED);
    }
}
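
// Illustrative sketch (not from this file): with three mtype bits, an
// m_type value of 0b101 from Shader.py would test as
//
//   std::bitset<3> example(0b101);
//   example.test(0);   // -> true
//   example.test(1);   // -> false
//   example.test(2);   // -> true
//
// Which coherence flag each bit position selects is defined by MtypeFlags;
// the bit assignment shown here is only for illustration.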

const char*
GPUComputeDriver::DriverWakeupEvent::description() const
{
    return "DriverWakeupEvent";
}

/**
 * Create an FD entry for the KFD inside of the owning process.
 */
int
GPUComputeDriver::open(ThreadContext *tc, int mode, int flags)
{
    DPRINTF(GPUDriver, "Opened %s\n", filename);
    auto process = tc->getProcessPtr();
    auto device_fd_entry = std::make_shared<DeviceFDEntry>(this, filename);
    int tgt_fd = process->fds->allocFD(device_fd_entry);
    return tgt_fd;
}
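
// Usage sketch (an assumption about the SE-mode flow, not code from this
// file): the ROCm thunk reaches this hook with an ordinary open() syscall
// on the emulated device node registered for this driver, e.g.
//
//   int kfd_fd = open("/dev/kfd", O_RDWR);  // serviced by open() above
//
// after which ioctl()/mmap() calls on kfd_fd are routed to the methods
// below.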

/**
 * Currently, mmap() will simply setup a mapping for the associated
 * device's packet processor's doorbells and creates the event page.
 */
Addr
GPUComputeDriver::mmap(ThreadContext *tc, Addr start, uint64_t length,
                       int prot, int tgt_flags, int tgt_fd, off_t offset)
{
    auto process = tc->getProcessPtr();
    auto mem_state = process->memState;

    Addr pg_off = offset >> PAGE_SHIFT;
    Addr mmap_type = pg_off & KFD_MMAP_TYPE_MASK;
    DPRINTF(GPUDriver, "amdkfd mmap (start: %p, length: 0x%x, "
            "offset: 0x%x)\n", start, length, offset);

    switch (mmap_type) {
      case KFD_MMAP_TYPE_DOORBELL:
        DPRINTF(GPUDriver, "amdkfd mmap type DOORBELL offset\n");
        start = mem_state->extendMmap(length);
        process->pTable->map(start, device->hsaPacketProc().pioAddr,
                             length, false);
        break;
      case KFD_MMAP_TYPE_EVENTS:
        DPRINTF(GPUDriver, "amdkfd mmap type EVENTS offset\n");
        panic_if(start != 0,
                 "Start address should be provided by KFD\n");
        panic_if(length != 8 * KFD_SIGNAL_EVENT_LIMIT,
                 "Requested length %d, expected length %d; length "
                 "mismatch\n", length, 8 * KFD_SIGNAL_EVENT_LIMIT);
        /**
         * We don't actually access these pages. We just reserve a VA
         * range once for the process's signal slots and hand the same
         * event page back on subsequent calls.
         */
        if (!eventPage) {
            eventPage = mem_state->extendMmap(length);
            start = eventPage;
        }
        break;
      default:
        warn_once("Unrecognized kfd mmap type %llx\n", mmap_type);
        break;
    }

    return start;
}
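
// Worked decoding example (values illustrative): for a doorbell offset
// produced by allocateQueue() below,
//
//   pg_off    = offset >> PAGE_SHIFT;
//   mmap_type = pg_off & KFD_MMAP_TYPE_MASK;  // == KFD_MMAP_TYPE_DOORBELL
//
// i.e. the type tag rides in the high bits of the page offset, which is
// why the switch above can dispatch on it without any other state.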

/**
 * Forward relevant parameters to packet processor; queueId
 * is used to link the doorbell. The queueIDs are not re-used
 * in the current implementation, and we allocate only one
 * page (4096 bytes) for doorbells, so check that this queueID
 * can be mapped into that page.
 */
void
GPUComputeDriver::allocateQueue(PortProxy &mem_proxy, Addr ioc_buf_addr)
{
    TypedBufferArg<kfd_ioctl_create_queue_args> args(ioc_buf_addr);
    args.copyIn(mem_proxy);

    if ((doorbellSize() * queueId) > 4096) {
        fatal("%s: Exceeded maximum number of HSA queues allowed\n", name());
    }

    args->doorbell_offset = (KFD_MMAP_TYPE_DOORBELL |
        KFD_MMAP_GPU_ID(args->gpu_id)) << PAGE_SHIFT;

    // For Vega, the offset needs to include the exact doorbell location.
    if (doorbellSize())
        args->doorbell_offset += queueId * doorbellSize();

    args->queue_id = queueId++;
    auto &hsa_pp = device->hsaPacketProc();
    hsa_pp.setDeviceQueueDesc(args->read_pointer_address,
                              args->ring_base_address, args->queue_id,
                              args->ring_size, doorbellSize(), gfxVersion);
    args.copyOut(mem_proxy);
}
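
// Worked example (illustrative, assuming PAGE_SHIFT == 12 and an 8-byte
// Vega doorbell): the third queue (queueId == 2) of a given gpu_id gets
//
//   doorbell_offset = ((KFD_MMAP_TYPE_DOORBELL |
//                       KFD_MMAP_GPU_ID(gpu_id)) << 12) + 2 * 8;
//
// mmap() keeps only the bits above PAGE_SHIFT to find the doorbell page,
// while user space uses the low byte remainder to ring queue 2's doorbell.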

void
GPUComputeDriver::DriverWakeupEvent::scheduleWakeup(Tick wakeup_delay)
{
    assert(driver);
    driver->schedule(this, curTick() + wakeup_delay);
}

void
GPUComputeDriver::signalWakeupEvent(uint32_t event_id)
{
    panic_if(event_id >= eventSlotIndex,
             "Trying wakeup on an event that is not yet created\n");
    if (ETable[event_id].threadWaiting) {
        panic_if(!ETable[event_id].tc,
                 "No thread context to wake up\n");
        ThreadContext *tc = ETable[event_id].tc;
        DPRINTF(GPUDriver,
                "Signal event: Waking up CPU %d\n", tc->cpuId());
        // Remove events that can wake up this thread
        TCEvents[tc].clearEvents();
        // Now wake up this thread
        tc->activate();
    } else {
        // This may be a race between an ioctl call asking to wait on this
        // event and this signalWakeupEvent. We handle that race by setting
        // the event here; the ioctl path takes the necessary action when it
        // finds the event already set. Alternatively, the runtime may have
        // genuinely decided not to wait on this event, but since we cannot
        // distinguish that case from the race, we set the event either way.
        ETable[event_id].setEvent = true;
    }
}
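
// Sketch of the event protocol implemented above and in the ioctls below:
//
//   CREATE_EVENT -> allocate event_id; ETable[event_id] = {}
//   WAIT_EVENTS  -> threadWaiting = true, tc recorded, thread suspended
//   SET_EVENT    -> signalWakeupEvent(event_id): wake tc if it is waiting,
//                   otherwise latch setEvent so a later wait returns
//                   without sleeping.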

void
GPUComputeDriver::DriverWakeupEvent::process()
{
    DPRINTF(GPUDriver,
            "Timer event: Waking up CPU %d\n", tc->cpuId());
    // Remove events that can wake up this thread
    driver->TCEvents[tc].clearEvents();
    // Now wake up this thread
    tc->activate();
}

int
GPUComputeDriver::ioctl(ThreadContext *tc, unsigned req, Addr ioc_buf)
{
    TranslatingPortProxy fs_proxy(tc);
    SETranslatingPortProxy se_proxy(tc);
    PortProxy &virt_proxy = FullSystem ? fs_proxy : se_proxy;
    auto process = tc->getProcessPtr();
    auto mem_state = process->memState;

    switch (req) {
      case AMDKFD_IOC_GET_VERSION:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_VERSION\n");

            TypedBufferArg<kfd_ioctl_get_version_args> args(ioc_buf);
            args->major_version = KFD_IOCTL_MAJOR_VERSION;
            args->minor_version = KFD_IOCTL_MINOR_VERSION;

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_CREATE_QUEUE:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_QUEUE\n");

            allocateQueue(virt_proxy, ioc_buf);

            DPRINTF(GPUDriver, "Creating queue %d\n", queueId);
        }
        break;
      case AMDKFD_IOC_DESTROY_QUEUE:
        {
            TypedBufferArg<kfd_ioctl_destroy_queue_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_QUEUE; "
                    "queue offset %d\n", args->queue_id);
            device->hsaPacketProc().unsetDeviceQueueDesc(args->queue_id,
                                                         doorbellSize());
        }
        break;
      case AMDKFD_IOC_SET_MEMORY_POLICY:
        {
            /**
             * This is where the runtime requests an MTYPE policy for an
             * aperture. The global memory aperture is divided into a
             * default aperture and an alternate aperture, each with its
             * own MTYPE policy; this lets a small piece of global memory
             * be marked uncacheable. Host memory mappings are carved out
             * of that uncacheable aperture, which is how 'coherent'
             * host/device memory is implemented on dGPUs.
             *
             * TODO: Add support for this by setting the MTYPE in the GPU
             * TLB.
             */
            warn("unimplemented ioctl: AMDKFD_IOC_SET_MEMORY_POLICY\n");
        }
        break;
      case AMDKFD_IOC_GET_CLOCK_COUNTERS:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_CLOCK_COUNTERS\n");

            TypedBufferArg<kfd_ioctl_get_clock_counters_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            // Set nanosecond resolution
            args->system_clock_freq = 1000000000;

            /**
             * Derive all of the clock counters from the current tick; all
             * clocks here have the same resolution and are perfectly in
             * sync.
             */
            uint64_t elapsed_nsec = curTick() / sim_clock::as_int::ns;
            args->gpu_clock_counter = elapsed_nsec;
            args->cpu_clock_counter = elapsed_nsec;
            args->system_clock_counter = elapsed_nsec;

            args.copyOut(virt_proxy);
        }
        break;
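        // Worked example (assuming gem5's default 1 ps tick):
        // sim_clock::as_int::ns is then 1000 ticks per ns, so at
        // curTick() == 5,000,000 every counter above reads
        // 5,000,000 / 1000 = 5,000 ns.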
      case AMDKFD_IOC_GET_PROCESS_APERTURES:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_args>
                args(ioc_buf);
            args->num_of_nodes = 1;

            /**
             * Set the GPUVM/LDS/Scratch APEs the same way the real
             * KFD sets them up for each node.
             */
            for (int i = 0; i < args->num_of_nodes; ++i) {
                /**
                 * While the GPU node numbers start at 0, we add 1
                 * to force the count to start at 1. This is to
                 * ensure that base/limit addresses are calculated
                 * correctly.
                 */
                switch (gfxVersion) {
                  case GfxVersion::gfx801:
                  case GfxVersion::gfx803:
                    args->process_apertures[i].scratch_base =
                        scratchApeBase(i + 1);
                    args->process_apertures[i].lds_base =
                        ldsApeBase(i + 1);
                    break;
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    args->process_apertures[i].scratch_base =
                        scratchApeBaseV9();
                    args->process_apertures[i].lds_base =
                        ldsApeBaseV9();
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // GFX8 and GFX9 set lds and scratch limits the same way
                args->process_apertures[i].scratch_limit =
                    scratchApeLimit(args->process_apertures[i].scratch_base);

                args->process_apertures[i].lds_limit =
                    ldsApeLimit(args->process_apertures[i].lds_base);

                switch (gfxVersion) {
                  case GfxVersion::gfx801:
                    args->process_apertures[i].gpuvm_base =
                        gpuVmApeBase(i + 1);
                    args->process_apertures[i].gpuvm_limit =
                        gpuVmApeLimit(args->process_apertures[i].gpuvm_base);
                    break;
                  case GfxVersion::gfx803:
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    // Taken from SVM_USE_BASE in the Linux kernel
                    args->process_apertures[i].gpuvm_base = 0x1000000ull;
                    // Taken from AMDGPU_GMC_HOLE_START in the Linux kernel
                    args->process_apertures[i].gpuvm_limit =
                        0x0000800000000000ULL - 1;
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // NOTE: Must match ID populated by hsaTopology.py
                //
                // https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/
                // blob/6a986c0943e9acd8c4c0cf2a9d510ff42167b43f/include/uapi/
                // linux/kfd_ioctl.h#L564
                //
                // The gpu_id is a device identifier used by the driver for
                // ioctls that allocate arguments. Each device has a unique
                // id composed of a non-zero base and an offset.
                if (isdGPU) {
                    switch (gfxVersion) {
                      case GfxVersion::gfx803:
                        args->process_apertures[i].gpu_id = 50156;
                        break;
                      case GfxVersion::gfx900:
                        args->process_apertures[i].gpu_id = 22124;
                        break;
                      default:
                        fatal("Invalid gfx version for dGPU\n");
                    }
                } else {
                    switch (gfxVersion) {
                      case GfxVersion::gfx801:
                      case GfxVersion::gfx902:
                        args->process_apertures[i].gpu_id = 2765;
                        break;
                      default:
                        fatal("Invalid gfx version for APU\n");
                    }
                }

                DPRINTF(GPUDriver, "GPUVM base for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_base);
                DPRINTF(GPUDriver, "GPUVM limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].gpuvm_limit);

                DPRINTF(GPUDriver, "LDS base for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_base);
                DPRINTF(GPUDriver, "LDS limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].lds_limit);

                DPRINTF(GPUDriver, "Scratch base for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_base);
                DPRINTF(GPUDriver, "Scratch limit for node[%i] = %#x\n", i,
                        args->process_apertures[i].scratch_limit);

                /**
                 * The CPU's canonical 64-bit address space only covers
                 * VA[63:47] == all-ones or all-zeros, so make sure the
                 * scratch and LDS apertures land in the hole between those
                 * ranges, where they can never alias a CPU address.
                 */
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].scratch_limit, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_base, 63,
                       47) != 0);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0x1ffff);
                assert(bits<Addr>(args->process_apertures[i].lds_limit, 63,
                       47) != 0);
            }

            args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_UPDATE_QUEUE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_UPDATE_QUEUE\n");
        }
        break;
      case AMDKFD_IOC_CREATE_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_CREATE_EVENT\n");

            TypedBufferArg<kfd_ioctl_create_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            if (args->event_type != KFD_IOC_EVENT_SIGNAL) {
                warn("Only signal events are currently supported\n");
            } else if (eventSlotIndex == SLOTS_PER_PAGE) {
                fatal("Signal event wasn't created; signal limit reached\n");
            }
            // Currently, we allocate only one signal_page for events.
            // Note that this signal page is of size
            // 8 * KFD_SIGNAL_EVENT_LIMIT.
            uint64_t page_index = 0;
            args->event_page_offset = (page_index | KFD_MMAP_TYPE_EVENTS);
            args->event_page_offset <<= PAGE_SHIFT;
            // TODO: Currently we support only signal events, hence using
            // the same ID for both signal slot and event slot
            args->event_slot_index = eventSlotIndex;
            args->event_id = eventSlotIndex++;
            args->event_trigger_data = args->event_id;
            DPRINTF(GPUDriver, "amdkfd create events "
                    "(event_id: 0x%x, offset: 0x%x)\n",
                    args->event_id, args->event_page_offset);
            // Since eventSlotIndex is incremented every time a new event is
            // created, the ETable entry at eventSlotIndex (event_id) is
            // guaranteed to be empty. A future implementation that reuses
            // deleted event_ids should check that the entry at this
            // eventSlotIndex (event_id) is empty before inserting a new
            // event table entry.
            ETable.emplace(std::pair<uint32_t, ETEntry>(args->event_id, {}));
            args.copyOut(virt_proxy);
        }
        break;
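        // Worked example (assuming PAGE_SHIFT == 12): with page_index == 0
        // the runtime receives
        // event_page_offset == KFD_MMAP_TYPE_EVENTS << 12. Passing that
        // value back as the mmap() offset makes the mmap handler above
        // decode mmap_type == KFD_MMAP_TYPE_EVENTS and return the shared
        // event page.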
      case AMDKFD_IOC_DESTROY_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_DESTROY_EVENT\n");
            TypedBufferArg<kfd_ioctl_destroy_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "amdkfd destroying event %d\n", args->event_id);
            fatal_if(ETable.count(args->event_id) == 0,
                     "Event ID invalid, cannot destroy this event\n");
            ETable.erase(args->event_id);
        }
        break;
      case AMDKFD_IOC_SET_EVENT:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_SET_EVENT\n");
            TypedBufferArg<kfd_ioctl_set_event_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            DPRINTF(GPUDriver, "amdkfd set event %d\n", args->event_id);
            fatal_if(ETable.count(args->event_id) == 0,
                     "Event ID invalid, cannot set this event\n");
            ETable[args->event_id].setEvent = true;
            signalWakeupEvent(args->event_id);
        }
        break;
      case AMDKFD_IOC_RESET_EVENT:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_RESET_EVENT\n");
        }
        break;
      case AMDKFD_IOC_WAIT_EVENTS:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_WAIT_EVENTS\n");
            TypedBufferArg<kfd_ioctl_wait_events_args> args(ioc_buf);
            args.copyIn(virt_proxy);
            kfd_event_data *events =
                (kfd_event_data *)args->events_ptr;
            DPRINTF(GPUDriver, "amdkfd wait for events "
                    "(wait on all: %d, timeout: %d, num_events: %d)\n",
                    args->wait_for_all, args->timeout, args->num_events);
            panic_if(args->wait_for_all != 0 && args->num_events > 1,
                     "Wait for all events not supported\n");
            bool should_sleep = true;
            if (TCEvents.count(tc) == 0) {
                // This thread context is waiting on an event for the first
                // time; initialize its event list.
                TCEvents.emplace(std::piecewise_construct, std::make_tuple(tc),
                                 std::make_tuple(this, tc));
                DPRINTF(GPUDriver, "\tamdkfd creating event list"
                        " for thread %d\n", tc->cpuId());
            }
            panic_if(TCEvents[tc].signalEvents.size() != 0,
                     "There are %d events that put this thread to sleep, "
                     "so this thread should not be running\n",
                     TCEvents[tc].signalEvents.size());
            for (int i = 0; i < args->num_events; i++) {
                panic_if(!events,
                         "Event pointer invalid\n");
                Addr eventDataAddr = (Addr)(events + i);
                TypedBufferArg<kfd_event_data> EventData(
                    eventDataAddr, sizeof(kfd_event_data));
                EventData.copyIn(virt_proxy);
                DPRINTF(GPUDriver,
                        "\tamdkfd wait for event %d\n", EventData->event_id);
                panic_if(ETable.count(EventData->event_id) == 0,
                         "Event ID invalid, cannot wait on this event\n");
                if (ETable[EventData->event_id].threadWaiting)
                    warn("Multiple threads waiting on the same event\n");
                if (ETable[EventData->event_id].setEvent) {
                    // If the event is already set, it has already happened.
                    // Just unset the event and don't put this thread to
                    // sleep.
                    ETable[EventData->event_id].setEvent = false;
                    should_sleep = false;
                }
                if (should_sleep) {
                    // Put this thread to sleep
                    ETable[EventData->event_id].threadWaiting = true;
                    ETable[EventData->event_id].tc = tc;
                    TCEvents[tc].signalEvents.insert(EventData->event_id);
                }
            }

            // TODO: Return the correct wait_result back. Currently, returning
            // success for both KFD_WAIT_TIMEOUT and KFD_WAIT_COMPLETE.
            // Ideally, this needs to be done after the event is triggered
            // and after the thread is woken up.
            args->wait_result = 0;
            args.copyOut(virt_proxy);
            if (should_sleep) {
                // Put this thread to sleep
                sleepCPU(tc, args->timeout);
            } else {
                // Remove events that tried to put this thread to sleep
                TCEvents[tc].clearEvents();
            }
        }
        break;
      case AMDKFD_IOC_DBG_REGISTER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_REGISTER\n");
        }
        break;
      case AMDKFD_IOC_DBG_UNREGISTER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_UNREGISTER\n");
        }
        break;
      case AMDKFD_IOC_DBG_ADDRESS_WATCH:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_ADDRESS_WATCH\n");
        }
        break;
      case AMDKFD_IOC_DBG_WAVE_CONTROL:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_DBG_WAVE_CONTROL\n");
        }
        break;
      case AMDKFD_IOC_SET_SCRATCH_BACKING_VA:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_SCRATCH_BACKING_VA\n");
        }
        break;
      case AMDKFD_IOC_GET_TILE_CONFIG:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_TILE_CONFIG\n");
        }
        break;
      case AMDKFD_IOC_SET_TRAP_HANDLER:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_TRAP_HANDLER\n");
        }
        break;
      case AMDKFD_IOC_GET_PROCESS_APERTURES_NEW:
        {
            DPRINTF(GPUDriver,
                    "ioctl: AMDKFD_IOC_GET_PROCESS_APERTURES_NEW\n");

            TypedBufferArg<kfd_ioctl_get_process_apertures_new_args>
                ioc_args(ioc_buf);

            ioc_args.copyIn(virt_proxy);
            ioc_args->num_of_nodes = 1;

            for (int i = 0; i < ioc_args->num_of_nodes; ++i) {
                TypedBufferArg<kfd_process_device_apertures> ape_args
                    (ioc_args->kfd_process_device_apertures_ptr);

                switch (gfxVersion) {
                  case GfxVersion::gfx801:
                  case GfxVersion::gfx803:
                    ape_args->scratch_base = scratchApeBase(i + 1);
                    ape_args->lds_base = ldsApeBase(i + 1);
                    break;
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    ape_args->scratch_base = scratchApeBaseV9();
                    ape_args->lds_base = ldsApeBaseV9();
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // GFX8 and GFX9 set lds and scratch limits the same way
                ape_args->scratch_limit =
                    scratchApeLimit(ape_args->scratch_base);
                ape_args->lds_limit = ldsApeLimit(ape_args->lds_base);

                switch (gfxVersion) {
                  case GfxVersion::gfx801:
                    ape_args->gpuvm_base = gpuVmApeBase(i + 1);
                    ape_args->gpuvm_limit =
                        gpuVmApeLimit(ape_args->gpuvm_base);
                    break;
                  case GfxVersion::gfx803:
                  case GfxVersion::gfx900:
                  case GfxVersion::gfx902:
                    // Taken from SVM_USE_BASE in the Linux kernel
                    ape_args->gpuvm_base = 0x1000000ull;
                    // Taken from AMDGPU_GMC_HOLE_START in the Linux kernel
                    ape_args->gpuvm_limit = 0x0000800000000000ULL - 1;
                    break;
                  default:
                    fatal("Invalid gfx version\n");
                }

                // NOTE: Must match ID populated by hsaTopology.py
                if (isdGPU) {
                    switch (gfxVersion) {
                      case GfxVersion::gfx803:
                        ape_args->gpu_id = 50156;
                        break;
                      case GfxVersion::gfx900:
                        ape_args->gpu_id = 22124;
                        break;
                      default:
                        fatal("Invalid gfx version for dGPU\n");
                    }
                } else {
                    switch (gfxVersion) {
                      case GfxVersion::gfx801:
                      case GfxVersion::gfx902:
                        ape_args->gpu_id = 2765;
                        break;
                      default:
                        fatal("Invalid gfx version for APU\n");
                    }
                }

                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->scratch_limit, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_base, 63, 47) != 0);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0x1ffff);
                assert(bits<Addr>(ape_args->lds_limit, 63, 47) != 0);

                ape_args.copyOut(virt_proxy);
            }

            ioc_args.copyOut(virt_proxy);
        }
        break;
      case AMDKFD_IOC_ACQUIRE_VM:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_ACQUIRE_VM\n");
        }
        break;
      /**
       * In real hardware, this IOCTL maps host memory, dGPU memory, or dGPU
       * doorbells into GPUVM space. Essentially, ROCm implements SVM by
       * carving out a region of free VA space that both the host and GPUVM
       * can agree upon. The entire GPU VA space is reserved on the host
       * using a fixed mmap at a low VA range that is also directly
       * accessible by the GPU's limited number of VA bits. When memory is
       * allocated later in the program, this IOCTL creates BOs/VMAs in the
       * driver and binds them to physical memory/doorbells.
       *
       * For gem5, we don't need to carve out any GPUVM space here (we don't
       * support GPUVM and use host page tables on the GPU directly). We can
       * just use the existing host SVM region; each memory type is commented
       * on separately below.
       */
      case AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_ALLOC_MEMORY_OF_GPU\n");
            TypedBufferArg<kfd_ioctl_alloc_memory_of_gpu_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            assert(isdGPU || gfxVersion == GfxVersion::gfx902);
            assert((args->va_addr % X86ISA::PageBytes) == 0);
            [[maybe_unused]] Addr mmap_offset = 0;

            Request::CacheCoherenceFlags mtype = defaultMtype;
            Addr pa_addr = 0;

            int npages = divCeil(args->size, (int64_t)X86ISA::PageBytes);
            bool cacheable = true;

            if (KFD_IOC_ALLOC_MEM_FLAGS_VRAM & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: VRAM\n");
                args->mmap_offset = args->va_addr;
                // VRAM allocations are device memory mapped into GPUVM
                // space.
                //
                // We can't rely on the lazy host allocator (fixupFault) to
                // handle this mapping since it needs to be placed in dGPU
                // framebuffer memory. The lazy allocator will try to place
                // this in host memory.
                //
                // TODO: We don't have the appropriate bifurcation of the
                // physical address space with different memory controllers
                // yet. This is where we will explicitly add the PT maps to
                // dGPU memory in the future.
                //
                // Bind the VA space to the dGPU physical memory pool. Mark
                // this region as Uncacheable. The Uncacheable flag is only
                // really used by the CPU and is ignored by the GPU. We mark
                // this as uncacheable from the CPU so that we can implement
                // direct CPU framebuffer access similar to what we currently
                // offer in real HW through the so-called Large BAR feature.
                pa_addr = process->seWorkload->allocPhysPages(
                        npages, dGPUPoolID);
                // TODO: Uncacheable accesses need to be supported by the
                // CPU-side protocol for this to work correctly. I believe
                // it only works right now if the physical memory is MMIO.
                cacheable = false;

                DPRINTF(GPUDriver, "Mapping VA %p to framebuffer PA %p size "
                        "%d\n", args->va_addr, pa_addr, args->size);

            } else if (KFD_IOC_ALLOC_MEM_FLAGS_USERPTR & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: USERPTR\n");
                mmap_offset = args->mmap_offset;
                // USERPTR allocations are system memory mapped into GPUVM
                // space. The user provides the driver with the pointer.
                pa_addr = process->seWorkload->allocPhysPages(npages);

                DPRINTF(GPUDriver, "Mapping VA %p to host PA %p size "
                        "%d\n", args->va_addr, pa_addr, args->size);

                // If the HSA runtime requests system coherent memory, then
                // we need to explicitly mark this region as uncacheable
                // from the perspective of the GPU.
                if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
                    mtype.clear();

            } else if (KFD_IOC_ALLOC_MEM_FLAGS_GTT & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: GTT\n");
                args->mmap_offset = args->va_addr;
                // GTT allocations are system memory mapped into GPUVM space.
                // It's different than a USERPTR allocation since the driver
                // itself allocates the physical memory on the host.
                //
                // We will lazily map it into host memory on first touch. The
                // fixupFault will find the original SVM aperture mapped to
                // the host.
                pa_addr = process->seWorkload->allocPhysPages(npages);

                DPRINTF(GPUDriver, "Mapping VA %p to host PA %p size "
                        "%d\n", args->va_addr, pa_addr, args->size);

                // If the HSA runtime requests system coherent memory, then
                // we need to explicitly mark this region as uncacheable
                // from the perspective of the GPU.
                if (args->flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
                    mtype.clear();

                // Note that for GTT the thunk layer needs to call mmap on
                // the driver FD later if it wants the host to have access to
                // this memory (which it probably does). This will be
                // ignored.
            } else if (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL & args->flags) {
                DPRINTF(GPUDriver, "amdkfd allocation type: DOORBELL\n");
                // DOORBELL allocations are the queue doorbells that are
                // memory mapped into GPUVM space.
                //
                // Explicitly map this virtual address to our PIO doorbell
                // interface in the page tables (non-cacheable).
                pa_addr = device->hsaPacketProc().pioAddr;
                cacheable = false;
            }

            DPRINTF(GPUDriver, "amdkfd allocation arguments: va_addr %p "
                    "size %lu, mmap_offset %p, gpu_id %d\n",
                    args->va_addr, args->size, mmap_offset, args->gpu_id);

            // Bind the selected physical memory to the provided virtual
            // address range in the X86 page tables.
            process->pTable->map(args->va_addr, pa_addr, args->size,
                                 cacheable);

            // We keep track of allocated regions of GPU mapped memory,
            // just like the driver would. This allows us to provide the
            // user with a unique handle for a given allocation. The user
            // will only provide us with a handle after allocation and expect
            // us to be able to use said handle to extract all the properties
            // of the region.
            //
            // This is a simplified version of regular system VMAs, but for
            // GPUVM space (none of the clobber/remap nonsense we find in
            // real OS managed memory).
            allocateGpuVma(mtype, args->va_addr, args->size);

            // Used by the runtime to uniquely identify this allocation.
            // We can just use the starting address of the VMA region.
            args->handle = args->va_addr;
            args.copyOut(virt_proxy);
        }
        break;
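        // Resulting contract (sketch): after this ioctl returns, the
        // runtime holds args->handle == va_addr and passes it back to
        // AMDKFD_IOC_FREE_MEMORY_OF_GPU below, where the handle is used to
        // look up the tracked VMA, recover its size, and unmap it.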
      case AMDKFD_IOC_FREE_MEMORY_OF_GPU:
        {
            DPRINTF(GPUDriver, "ioctl: AMDKFD_IOC_FREE_MEMORY_OF_GPU\n");
            TypedBufferArg<kfd_ioctl_free_memory_of_gpu_args> args(ioc_buf);
            args.copyIn(virt_proxy);

            assert(isdGPU);
            DPRINTF(GPUDriver, "amdkfd free arguments: handle %p ",
                    args->handle);

            // We don't recycle physical pages in SE mode
            Addr size = deallocateGpuVma(args->handle);
            process->pTable->unmap(args->handle, size);

            // TODO: IOMMU and GPUTLBs do not seem to correctly support
            // shootdown. This is also a potential issue for APU systems
            // that perform unmap or remap with system memory.
            tc->getMMUPtr()->flushAll();

            args.copyOut(virt_proxy);
        }
        break;
      /**
       * Maps an existing allocation into the GPU's GPUVM address space.
       * Since we use the host page tables directly and already bound the
       * region in ALLOC_MEMORY_OF_GPU above, there is nothing left to do
       * here.
       */
      case AMDKFD_IOC_MAP_MEMORY_TO_GPU:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_MAP_MEMORY_TO_GPU\n");
        }
        break;
      case AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU\n");
        }
        break;
      case AMDKFD_IOC_SET_CU_MASK:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SET_CU_MASK\n");
        }
        break;
      case AMDKFD_IOC_GET_QUEUE_WAVE_STATE:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_QUEUE_WAVE_STATE\n");
        }
        break;
      case AMDKFD_IOC_GET_DMABUF_INFO:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_GET_DMABUF_INFO\n");
        }
        break;
      case AMDKFD_IOC_IMPORT_DMABUF:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_IMPORT_DMABUF\n");
        }
        break;
      case AMDKFD_IOC_ALLOC_QUEUE_GWS:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_ALLOC_QUEUE_GWS\n");
        }
        break;
      case AMDKFD_IOC_SMI_EVENTS:
        {
            warn("unimplemented ioctl: AMDKFD_IOC_SMI_EVENTS\n");
        }
        break;
      default:
        fatal("%s: bad ioctl %d\n", name(), req);
        break;
    }
    return 0;
}

void
GPUComputeDriver::sleepCPU(ThreadContext *tc, uint32_t milliSecTimeout)
{
    // Convert milliseconds to ticks
    Tick wakeup_delay((uint64_t)milliSecTimeout * 1000000000);
    assert(TCEvents.count(tc) == 1);
    TCEvents[tc].timerEvent.scheduleWakeup(wakeup_delay);
    tc->suspend();
    DPRINTF(GPUDriver,
            "CPU %d is put to sleep\n", tc->cpuId());
}
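
// Worked example (assuming gem5's default 1 ps tick): 1 ms is 10^9 ticks,
// so a 10 ms timeout schedules the wakeup 10 * 10^9 = 10^10 ticks from now.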

Addr
GPUComputeDriver::gpuVmApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x1000000000000L;
}

Addr
GPUComputeDriver::gpuVmApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL;
}

Addr
GPUComputeDriver::scratchApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x100000000L;
}

// Used for GFX9 devices
// From drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c in the Linux kernel
Addr
GPUComputeDriver::scratchApeBaseV9() const
{
    return ((Addr)0x1 << 48);
}

Addr
GPUComputeDriver::scratchApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}

Addr
GPUComputeDriver::ldsApeBase(int gpuNum) const
{
    return ((Addr)gpuNum << 61) + 0x0;
}

// Used for GFX9 devices
// From drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c in the Linux kernel
Addr
GPUComputeDriver::ldsApeBaseV9() const
{
    return ((Addr)0x2 << 48);
}

Addr
GPUComputeDriver::ldsApeLimit(Addr apeBase) const
{
    return (apeBase & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF;
}
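
// Worked values (these follow directly from the V9 functions above):
//
//   scratchApeBaseV9() = 0x1 << 48 = 0x0001000000000000
//   scratchApeLimit(.) =             0x00010000FFFFFFFF
//   ldsApeBaseV9()     = 0x2 << 48 = 0x0002000000000000
//   ldsApeLimit(.)     =             0x00020000FFFFFFFF
//
// Both ranges have VA[63:47] equal to neither 0 nor 0x1ffff, which is
// exactly the property the aperture ioctls above assert.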

void
GPUComputeDriver::allocateGpuVma(Request::CacheCoherenceFlags mtype,
                                 Addr start, Addr length)
{
    AddrRange range = AddrRange(start, start + length);
    DPRINTF(GPUDriver, "Registering [%p - %p] with MTYPE %d\n",
            range.start(), range.end(), mtype);
    fatal_if(gpuVmas.insert(range, mtype) == gpuVmas.end(),
             "Attempted to double register Mtypes for [%p - %p]\n",
             range.start(), range.end());
}

Addr
GPUComputeDriver::deallocateGpuVma(Addr start)
{
    auto vma = gpuVmas.contains(start);
    assert(vma != gpuVmas.end());
    assert((vma->first.start() == start));
    Addr size = vma->first.size();
    DPRINTF(GPUDriver, "Unregistering [%p - %p]\n", vma->first.start(),
            vma->first.end());
    gpuVmas.erase(vma);
    return size;
}

void
GPUComputeDriver::setMtype(RequestPtr req)
{
    // If we are a dGPU then set the MTYPE from our VMAs.
    if (isdGPU) {
        assert(!FullSystem);
        AddrRange range = RangeSize(req->getVaddr(), req->getSize());
        auto vma = gpuVmas.contains(range);
        assert(vma != gpuVmas.end());
        DPRINTF(GPUShader, "Setting req from [%p - %p] MTYPE %d\n",
                range.start(), range.end(), vma->second);
        req->setCacheCoherenceFlags(vma->second);
    // APUs always get the default MTYPE
    } else {
        req->setCacheCoherenceFlags(defaultMtype);
    }
}

} // namespace gem5