gem5 [DEVELOP-FOR-25.1]
Loading...
Searching...
No Matches
pm4_packet_processor.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2021 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
34
35#include "debug/PM4PacketProcessor.hh"
42#include "enums/GfxVersion.hh"
44#include "gpu-compute/shader.hh"
45#include "mem/packet.hh"
46#include "mem/packet_access.hh"
47
48namespace gem5
49{
50
// Construct the PM4 packet processor. Records this IP block's ID and MMIO
// aperture from the params, then zeroes the kernel interface queue (kiq)
// and primary queue (pq) descriptors so no stale fields are read before the
// driver programs them through MMIO writes.
51PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p)
52 : DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range)
53{
54 memset(&kiq, 0, sizeof(QueueDesc));
55 memset(&pq, 0, sizeof(QueueDesc));
56}
57
// Build a translation generator for a virtual address range used by DMA.
// Addresses inside the AGP aperture use AGP translation; everything else is
// assumed to be GART, per the comment below.
// NOTE(review): the signature line was lost in extraction; presumably
// TranslationGenPtr PM4PacketProcessor::translate(Addr vaddr, Addr size)
// -- confirm against the upstream source.
65{
66 if (gpuDevice->getVM().inAGP(vaddr)) {
67 // Use AGP translation gen
68 return TranslationGenPtr(
69 new AMDGPUVM::AGPTranslationGen(&gpuDevice->getVM(), vaddr, size));
70 }
71
72 // Assume GART otherwise as this is the only other translation aperture
73 // available to the PM4 packet processor.
74 return TranslationGenPtr(
75 new AMDGPUVM::GARTTranslationGen(&gpuDevice->getVM(), vaddr, size));
76}
77
// Returns an empty address range list: this device claims no PIO ranges of
// its own here. (Signature line lost in extraction; presumably
// AddrRangeList PM4PacketProcessor::getAddrRanges() const -- confirm.)
80{
81 AddrRangeList ranges;
82 return ranges;
83}
84
// Stores the back-pointer to the owning AMDGPU device; used by nearly every
// other method in this file. (Signature line lost in extraction.)
85void
87{
88 gpuDevice = gpu_device;
89}
90
// Convert an address into the form used for GART lookups. AGP addresses are
// returned unchanged; otherwise the page frame number is shifted left by 3
// (multiplied by 8) while the low 12 page-offset bits are preserved.
// (Signature line lost in extraction.)
91Addr
93{
94 if (!gpuDevice->getVM().inAGP(addr)) {
95 Addr low_bits = bits(addr, 11, 0);
96 addr = (((addr >> 12) << 3) << 12) | low_bits;
97 }
98 return addr;
99}
100
// Look up the PM4 queue registered at a doorbell offset. On a miss the
// queue is lazily created: the gfx flag selects the primary (PQ) queue,
// otherwise the kernel interface queue (KIQ) is mapped at that offset.
// (Signature line lost in extraction.)
101PM4Queue *
103{
104 auto result = queuesMap.find(offset);
105 if (result == queuesMap.end()) {
106 if (gfx)
107 mapPq(offset);
108 else
109 mapKiq(offset);
110 // mapPq/mapKiq insert the new queue into queuesMap keyed by offset.
111 return queuesMap[offset];
111 }
112 return result->second;
113}
114
115void
121
122void
128
// Register a new PM4 queue built from a memory queue descriptor (MQD) at a
// given doorbell offset. If no explicit id is given, the next free slot
// (current map size) is used.
// NOTE(review): extraction dropped original line 130 (first parameter line
// of the signature) and line 146 (the ternary else branch, presumably
// ": QueueType::Compute;") -- confirm against upstream gem5.
129void
131 PM4MapQueues *pkt, int id)
132{
133 if (id == -1)
134 id = queues.size();
135
136 /* 256 bytes aligned address */
137 mqd->base <<= 8;
138 PM4Queue *q = new PM4Queue(id, mqd, offset, pkt);
139
140 queuesMap[offset] = q;
141 queues[id] = q;
142
143 /* we are assuming only compute queues can be mapped from MQDs */
144 QueueType qt;
145 qt = mqd->aql ? QueueType::ComputeAQL
147 gpuDevice->setDoorbellType(offset, qt, getIpId());
148
149 DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: "
150 "%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(),
151 q->me(), q->pipe(), q->queue(), q->size());
152}
153
// Doorbell handler: update the queue's write pointer (the doorbell value is
// a dword offset, converted to bytes here) and kick off packet decoding if
// the queue is not already being processed. (Signature line lost in
// extraction; presumably process(PM4Queue *q, Addr wptrOffset) -- confirm.)
154void
156{
157 q->wptr(wptrOffset * sizeof(uint32_t));
158
159 if (!q->processing()) {
160 q->processing(true);
161 decodeNext(q);
162 }
163}
164
// Fetch the next PM4 header from the queue via virtual DMA and continue in
// decodeHeader(). When rptr catches up to wptr the queue is drained: stop
// processing, pop back out of an indirect buffer if we were in one, and
// write the read pointer back for the driver. (Signature line lost in
// extraction.)
165void
167{
168 DPRINTF(PM4PacketProcessor, "PM4 decode queue %d rptr %p, wptr %p\n",
169 q->id(), q->rptr(), q->wptr());
170
171 if (q->rptr() != q->wptr()) {
172 /* Additional braces here are needed due to a clang compilation bug
173 falsely throwing a "suggest braces around initialization of
174 subject" error. More info on this bug is available here:
175 https://stackoverflow.com/questions/31555584
176 */
177 PM4Header h{{{0, 0, 0, 0, 0, 0}}};
178 auto cb = new DmaVirtCallback<PM4Header>(
179 [ = ] (PM4Header header)
180 { decodeHeader(q, header); }, h);
181 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(uint32_t), cb,
182 &cb->dmaBuffer);
183 } else {
184 // Reached the end of processable data in the queue. Switch out of IB
185 // if this is an indirect buffer.
186 assert(q->rptr() == q->wptr());
187 q->processing(false);
188 if (q->ib()) {
189 q->ib(false);
190 decodeNext(q);
191 }
192
193 // Write back rptr when the queue is empty. For static queues which
194 // are not unmapped, this is how the driver knows there is enough
195 // space in the queue to continue writing packets to the ring buffer.
196 if (q->getMQD()->aqlRptr) {
197 Addr addr = getGARTAddr(q->getMQD()->aqlRptr);
198 uint32_t *data = new uint32_t;
199 // gem5 stores rptr as a bytes offset while the driver expects
200 // a dword offset. Convert the offset to dword count.
201 *data = q->getRptr() >> 2;
202 auto cb = new DmaVirtCallback<uint32_t>(
203 [data](const uint32_t &) { delete data; });
204 dmaWriteVirt(addr, sizeof(uint32_t), cb, data);
205 }
206 }
207}
208
// Dispatch on a PM4 packet opcode. For most opcodes the packet payload is
// DMA-read (through GART) into a freshly allocated packet struct, and the
// matching handler runs in the DMA completion callback. Simple opcodes
// (NOP, INVALIDATE_TLBS) are handled inline and decoding continues
// immediately. Unknown opcodes are skipped using header.count.
// NOTE(review): extraction dropped the "cb = new DmaVirtCallback<uint64_t>("
// allocation line in most cases below (original lines 232, 241, 250, 259,
// 268, 277, 286, 314, 323, 332) -- confirm against upstream gem5.
// (Signature line lost; presumably decodeHeader(PM4Queue *q, PM4Header
// header).)
209void
211{
212 DPRINTF(PM4PacketProcessor, "PM4 packet %p\n", header.opcode);
213
214 q->incRptr(sizeof(PM4Header));
215
216 DmaVirtCallback<uint64_t> *cb = nullptr;
217 void *dmaBuffer = nullptr;
218
219 switch(header.opcode) {
220 case IT_NOP: {
221 DPRINTF(PM4PacketProcessor, "PM4 nop, count %p\n", header.count);
222 DPRINTF(PM4PacketProcessor, "rptr %p wptr %p\n", q->rptr(), q->wptr());
223 // A count of 0x3fff is the special "no payload" encoding; otherwise
224 // skip count+1 dwords of padding.
223 if (header.count != 0x3fff) {
224 q->incRptr((header.count + 1) * sizeof(uint32_t));
225 }
226 decodeNext(q);
227 } break;
228 case IT_WRITE_DATA: {
229 dmaBuffer = new PM4WriteData();
230 DPRINTF(PM4PacketProcessor, "PM4 writeData header: %x, count: %d\n",
231 header.ordinal, header.count);
233 [ = ] (const uint64_t &)
234 { writeData(q, (PM4WriteData *)dmaBuffer, header); });
235 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WriteData), cb,
236 dmaBuffer);
237 } break;
238
239 case IT_MAP_QUEUES: {
240 dmaBuffer = new PM4MapQueues();
242 [ = ] (const uint64_t &)
243 { mapQueues(q, (PM4MapQueues *)dmaBuffer); });
244 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapQueues), cb,
245 dmaBuffer);
246 } break;
247
248 case IT_RELEASE_MEM: {
249 dmaBuffer = new PM4ReleaseMem();
251 [ = ] (const uint64_t &)
252 { releaseMem(q, (PM4ReleaseMem *)dmaBuffer); });
253 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4ReleaseMem), cb,
254 dmaBuffer);
255 } break;
256
257 case IT_INDIRECT_BUFFER: {
258 dmaBuffer = new PM4IndirectBuf();
260 [ = ] (const uint64_t &)
261 { indirectBuffer(q, (PM4IndirectBuf *)dmaBuffer); });
262 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4IndirectBuf), cb,
263 dmaBuffer);
264 } break;
265
266 case IT_SWITCH_BUFFER: {
267 dmaBuffer = new PM4SwitchBuf();
269 [ = ] (const uint64_t &)
270 { switchBuffer(q, (PM4SwitchBuf *)dmaBuffer); });
271 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4SwitchBuf), cb,
272 dmaBuffer);
273 } break;
274
275 case IT_SET_UCONFIG_REG: {
276 dmaBuffer = new PM4SetUconfigReg();
278 [ = ] (const uint64_t &)
279 { setUconfigReg(q, (PM4SetUconfigReg *)dmaBuffer); });
280 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4SetUconfigReg), cb,
281 dmaBuffer);
282 } break;
283
284 case IT_WAIT_REG_MEM: {
285 dmaBuffer = new PM4WaitRegMem();
287 [ = ] (const uint64_t &)
288 { waitRegMem(q, (PM4WaitRegMem *)dmaBuffer); });
289 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WaitRegMem), cb,
290 dmaBuffer);
291 } break;
292 case IT_MAP_PROCESS: {
293 // MAP_PROCESS has two layouts; GFX9.4.x parts use the V2 struct.
292 if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a ||
294 gpuDevice->getGfxVersion() == GfxVersion::gfx942 ||
295 gpuDevice->getGfxVersion() == GfxVersion::gfx950) {
296 dmaBuffer = new PM4MapProcessV2();
297 cb = new DmaVirtCallback<uint64_t>([=](const uint64_t &) {
298 mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer);
299 });
300 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2), cb,
301 dmaBuffer);
302 } else {
303 dmaBuffer = new PM4MapProcess();
304 cb = new DmaVirtCallback<uint64_t>([=](const uint64_t &) {
305 mapProcessV1(q, (PM4MapProcess *)dmaBuffer);
306 });
307 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
308 dmaBuffer);
309 }
310 } break;
311
312 case IT_UNMAP_QUEUES: {
313 dmaBuffer = new PM4UnmapQueues();
315 [ = ] (const uint64_t &)
316 { unmapQueues(q, (PM4UnmapQueues *)dmaBuffer); });
317 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4UnmapQueues), cb,
318 dmaBuffer);
319 } break;
320
321 case IT_RUN_LIST: {
322 dmaBuffer = new PM4RunList();
324 [ = ] (const uint64_t &)
325 { runList(q, (PM4RunList *)dmaBuffer); });
326 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4RunList), cb,
327 dmaBuffer);
328 } break;
329
330 case IT_QUERY_STATUS: {
331 dmaBuffer = new PM4QueryStatus();
333 [ = ] (const uint64_t &)
334 { queryStatus(q, (PM4QueryStatus *)dmaBuffer); });
335 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4QueryStatus), cb,
336 dmaBuffer);
337 } break;
338
339 case IT_INVALIDATE_TLBS: {
340 DPRINTF(PM4PacketProcessor, "Functionaly invalidating all TLBs\n");
341 gpuDevice->getVM().invalidateTLBs();
342 q->incRptr((header.count + 1) * sizeof(uint32_t));
343 decodeNext(q);
344 } break;
345
346 default: {
347 warn("PM4 packet opcode 0x%x not supported.\n", header.opcode);
348 DPRINTF(PM4PacketProcessor, "PM4 packet opcode 0x%x not supported.\n",
349 header.opcode);
350 q->incRptr((header.count + 1) * sizeof(uint32_t));
351 decodeNext(q);
352 } break;
353 }
354}
355
// Handle a WRITE_DATA packet. destSel 5 writes a variable-length dword
// payload to a memory address via DMA; destSel 0 writes a single value to a
// register (dword address). Other destinations are fatal.
// NOTE(review): extraction dropped original line 368 (presumably
// "Addr addr = getGARTAddr(pkt->destAddr);") -- confirm against upstream.
// (Signature line lost; presumably writeData(PM4Queue *q, PM4WriteData
// *pkt, PM4Header header).)
356void
358{
359 q->incRptr(sizeof(PM4WriteData));
360
361 DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p destSel: %d "
362 "addrIncr: %d resume: %d writeConfirm: %d cachePolicy: %d\n",
363 pkt->destAddr, pkt->data, pkt->destSel, pkt->addrIncr,
364 pkt->resume, pkt->writeConfirm, pkt->cachePolicy);
365
366 if (pkt->destSel == 5) {
367 // Memory address destination
369
370 // This is a variable length packet. The size of the packet is in
371 // the header.count field and is set as Number Of Dwords - 1. This
372 // packet is 4 bytes minimum meaning the count is minimum 3. To
373 // get the number of dwords of data subtract two from the count.
374 unsigned size = (header.count - 2) * sizeof(uint32_t);
375
376 DPRINTF(PM4PacketProcessor, "Writing %d bytes to %p\n", size, addr);
377 auto cb = new DmaVirtCallback<uint32_t>(
378 [ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
379 dmaWriteVirt(addr, size, cb, &pkt->data);
380
381 // Without write confirm, don't wait for the DMA; continue decoding
382 // now and let writeDataDone free the packet later.
381 if (!pkt->writeConfirm) {
382 decodeNext(q);
383 }
384 } else if (pkt->destSel == 0) {
385 // Register dword address destination
386 Addr byte_addr = pkt->destAddr << 2;
387
388 gpuDevice->setRegVal(byte_addr, pkt->data);
389
390 // setRegVal is instant on the simulated device so we ignore write
391 // confirm.
392 delete pkt;
393 decodeNext(q);
394 } else {
395 fatal("Unknown PM4 writeData destination %d\n", pkt->destSel);
396 }
397}
398
// DMA completion for a memory-destination WRITE_DATA. If write confirm was
// requested, decoding was deferred until now; either way this owns and
// frees the packet. (Signature line lost in extraction.)
399void
401{
402 DPRINTF(PM4PacketProcessor, "PM4 write completed to %p, %p.\n", addr,
403 pkt->data);
404
405 if (pkt->writeConfirm) {
406 decodeNext(q);
407 }
408
409 delete pkt;
410}
411
// Handle MAP_QUEUES: read the memory queue descriptor (MQD) for the queue
// being mapped. engineSel 0/1/4 are compute-style queues (partial MQD read
// at a 96-dword offset); engineSel 2/3 are SDMA queues (full MQD read).
// NOTE(review): extraction dropped original line 430 (start of a DPRINTF)
// and line 451 (presumably "Addr addr = getGARTAddr(pkt->mqdAddr);" for the
// SDMA path) -- confirm against upstream gem5.
// (Signature line lost; presumably mapQueues(PM4Queue *q, PM4MapQueues
// *pkt).)
412void
414{
415 q->incRptr(sizeof(PM4MapQueues));
416
417 DPRINTF(PM4PacketProcessor, "MAPQueues queueSel: %d, vmid: %d, me: %d, "
418 "pipe: %d, queueSlot: %d, queueType: %d, allocFormat: %d, "
419 "engineSel: %d, numQueues: %d, checkDisable: %d, doorbellOffset:"
420 " %d, mqdAddr: %lx, wptrAddr: %lx\n", pkt->queueSel, pkt->vmid,
421 pkt->me, pkt->pipe, pkt->queueSlot, pkt->queueType,
422 pkt->allocFormat, pkt->engineSel, pkt->numQueues,
423 pkt->checkDisable, pkt->doorbellOffset, pkt->mqdAddr,
424 pkt->wptrAddr);
425
426 // Partially reading the mqd with an offset of 96 dwords
427 if (pkt->engineSel == 0 || pkt->engineSel == 1 || pkt->engineSel == 4) {
428 Addr addr = getGARTAddr(pkt->mqdAddr + 96 * sizeof(uint32_t));
429
431 "Mapping mqd from %p %p (vmid %d - last vmid %d).\n",
432 addr, pkt->mqdAddr, pkt->vmid, gpuDevice->lastVMID());
433
434 // The doorbellOffset is a dword address. We shift by two / multiply
435 // by four to get the byte address to match doorbell addresses in
436 // the GPU device.
437 gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset << 2,
438 gpuDevice->lastVMID());
439
440 QueueDesc *mqd = new QueueDesc();
441 memset(mqd, 0, sizeof(QueueDesc));
442 auto cb = new DmaVirtCallback<uint32_t>(
443 [ = ] (const uint32_t &) {
444 processMQD(pkt, q, addr, mqd, gpuDevice->lastVMID()); });
445 dmaReadVirt(addr, sizeof(QueueDesc), cb, mqd);
446 } else if (pkt->engineSel == 2 || pkt->engineSel == 3) {
447 SDMAQueueDesc *sdmaMQD = new SDMAQueueDesc();
448 memset(sdmaMQD, 0, sizeof(SDMAQueueDesc));
449
450 // For SDMA we read the full MQD, so there is no offset calculation.
452
453 auto cb = new DmaVirtCallback<uint32_t>(
454 [ = ] (const uint32_t &) {
455 processSDMAMQD(pkt, q, addr, sdmaMQD,
456 gpuDevice->lastVMID()); });
457 dmaReadVirt(addr, sizeof(SDMAQueueDesc), cb, sdmaMQD);
458 } else {
459 panic("Unknown engine for MQD: %d\n", pkt->engineSel);
460 }
461}
462
// DMA completion for a compute MQD read: create the PM4 queue, associate it
// with the allocated VMID, and (for AQL queues) register the ring with the
// HSA packet processor. Finally replay any doorbells that arrived before
// the mapping completed. (First signature line lost in extraction.)
463void
465 QueueDesc *mqd, uint16_t vmid)
466{
467 DPRINTF(PM4PacketProcessor, "MQDbase: %lx, active: %d, vmid: %d, base: "
468 "%lx, rptr: %x aqlPtr: %lx\n", mqd->mqdBase, mqd->hqd_active,
469 mqd->hqd_vmid, mqd->base, mqd->rptr, mqd->aqlRptr);
470
471 // The doorbell is identified by its byte offset; mask off control bits.
471 Addr offset = mqd->doorbell & 0x1ffffffc;
472 newQueue(mqd, offset, pkt);
473 PM4Queue *new_q = queuesMap[offset];
474 gpuDevice->insertQId(vmid, new_q->id());
475
476 if (mqd->aql) {
477 // The queue size is encoded in the cp_hqd_pq_control field in the
478 // kernel driver in the 6 lowest bits as log2(queue_size / 4) - 1
479 // number of dwords.
480 //
481 // https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/
482 // roc-4.3.x/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c#L3561
483 //
484 // Queue size is then 2^(cp_hqd_pq_control[5:0] + 1) dword. Multiply
485 // by 4 to get the number of bytes as HSAPP expects.
486 int mqd_size = (1 << ((mqd->hqd_pq_control & 0x3f) + 1)) * 4;
487 auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
488 hsa_pp.setDeviceQueueDesc(mqd->aqlRptr, mqd->base, new_q->id(),
489 mqd_size, 8, GfxVersion::gfx900, offset,
490 mqd->mqdReadIndex);
491 }
492
493 DPRINTF(PM4PacketProcessor, "PM4 mqd read completed, base %p, mqd %p, "
494 "hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql);
495
496 gpuDevice->processPendingDoorbells(offset);
497
498 delete pkt;
499 decodeNext(q);
500}
501
// DMA completion for an SDMA MQD read: decode the ring size and rptr
// writeback address from the descriptor, register the RLC queue with the
// proper SDMA engine, wire up the doorbell, and replay pending doorbells.
// NOTE(review): extraction dropped original lines 513-515 (middle arguments
// of the DPRINTF below) -- confirm against upstream gem5.
// (First signature line lost in extraction.)
502void
504 SDMAQueueDesc *mqd, uint16_t vmid)
505{
506 // Ring size is encoded in rb_cntl bits [6:1] as log2; 4UL << n bytes.
506 uint32_t rlc_size = 4UL << bits(mqd->sdmax_rlcx_rb_cntl, 6, 1);
507 Addr rptr_wb_addr = mqd->sdmax_rlcx_rb_rptr_addr_hi;
508 rptr_wb_addr <<= 32;
509 rptr_wb_addr |= mqd->sdmax_rlcx_rb_rptr_addr_lo;
510
511 DPRINTF(PM4PacketProcessor, "SDMAMQD: rb base: %#lx rptr: %#x/%#x wptr: "
512 "%#x/%#x ib: %#x/%#x size: %d ctrl: %#x rptr wb addr: %#lx\n",
516 rlc_size, mqd->sdmax_rlcx_rb_cntl, rptr_wb_addr);
517
518 // Engine 2 points to SDMA0 while engine 3 points to SDMA1
519 assert(pkt->engineSel == 2 || pkt->engineSel == 3);
520 SDMAEngine *sdma_eng = gpuDevice->getSDMAById(pkt->engineSel - 2);
521
522 // Queue type 1 and 2 are "static" queues
523 bool is_static = (pkt->queueType == 2) || (pkt->queueType == 3);
524
525 // Register RLC queue with SDMA
526 sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2, addr, mqd, is_static);
527
528 // Register doorbell with GPU device
529 gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
530 gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
531
532 gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
533
534 delete pkt;
535 decodeNext(q);
536}
537
// Handle RELEASE_MEM: only dataSelect == 1 (write 32-bit dataLo to the
// fence address) is implemented; the completion callback raises the
// interrupt if requested.
// NOTE(review): extraction dropped original line 549 (start of the
// statement completed by line 550, likely a warn/DPRINTF about destSelect
// 0) -- confirm against upstream gem5. (Signature line lost too.)
538void
540{
541 q->incRptr(sizeof(PM4ReleaseMem));
542
543 Addr addr = getGARTAddr(pkt->addr);
544 DPRINTF(PM4PacketProcessor, "PM4 release_mem event %d eventIdx %d intSel "
545 "%d destSel %d dataSel %d, address %p data %p, intCtx %p\n",
546 pkt->event, pkt->eventIdx, pkt->intSelect, pkt->destSelect,
547 pkt->dataSelect, addr, pkt->dataLo, pkt->intCtxId);
548
550 "PM4 release_mem destSel 0 bypasses caches to MC.\n");
551
552 if (pkt->dataSelect == 1) {
553 auto cb = new DmaVirtCallback<uint32_t>(
554 [ = ](const uint32_t &) { releaseMemDone(q, pkt, addr); },
555 pkt->dataLo);
556 dmaWriteVirt(addr, sizeof(uint32_t), cb, &cb->dmaBuffer);
557 } else {
558 panic("Unimplemented PM4ReleaseMem.dataSelect");
559 }
560}
561
// DMA completion for RELEASE_MEM. intSelect == 2 raises an interrupt via
// the interrupt handler, encoding which me/pipe/queue completed into the
// ring id (0 for the KIQ, id 0).
// NOTE(review): extraction dropped original line 577 (middle arguments of
// prepareInterruptCookie) -- confirm against upstream gem5.
// (Signature line lost in extraction.)
562void
564{
565 DPRINTF(PM4PacketProcessor, "PM4 release_mem wrote %d to %p\n",
566 pkt->dataLo, addr);
567 if (pkt->intSelect == 2) {
568 DPRINTF(PM4PacketProcessor, "PM4 interrupt, id: %d ctx: %d, me: %d, "
569 "pipe: %d, queueSlot:%d\n", q->id(), pkt->intCtxId, q->me(),
570 q->pipe(), q->queue());
571
572 uint8_t ringId = 0;
573 if (q->id() != 0) {
574 ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe();
575 }
576 gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId,
578 2 * getIpId());
579 gpuDevice->getIH()->submitInterruptCookie();
580 }
581
582 delete pkt;
583 decodeNext(q);
584}
585
// Update the saved MQD read index for the queue mapped at a doorbell
// offset; the queue must already exist. (Signature line lost in
// extraction.)
586void
588{
589 assert(queuesMap.count(offset));
590 queuesMap[offset]->getMQD()->mqdReadIndex = rd_idx;
591}
592
// Unmap every user queue: write its (partial, 96-dword-offset) MQD back to
// memory so the driver sees the final read index, then remove it from the
// bookkeeping maps and the HSA packet processor. Privileged (KMD) queues
// are never unmapped; static queues are kept unless unmap_static is set.
// (Signature line lost in extraction; presumably
// unmapAllQueues(bool unmap_static).)
593void
595{
596 auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
597 for (auto iter : gpuDevice->getUsedVMIDs()) {
598 for (auto id : iter.second) {
599 assert(queues.count(id));
600
601 // Do not unmap KMD queues.
602 if (queues[id]->privileged()) {
603 continue;
604 }
605
606 // Do not unmap static queues if requested.
607 if (!unmap_static && queues[id]->isStatic()) {
608 continue;
609 }
610
611 QueueDesc *mqd = queues[id]->getMQD();
612 DPRINTF(PM4PacketProcessor, "Unmapping queue %d with read "
613 "index %ld\n", id, mqd->mqdReadIndex);
614
615 // Partially writing the mqd with an offset of 96 dwords as gem5
616 // does not use the full MQD and begins 96 dwords from the start
617 // of the full MQD structure. See src/dev/amdgpu/pm4_queues.hh.
618 Addr addr = getGARTAddr(queues[id]->mqdBase() +
619 96 * sizeof(uint32_t));
620 Addr mqd_base = queues[id]->mqdBase();
621 auto cb = new DmaVirtCallback<uint32_t>(
622 [ = ] (const uint32_t &) {
623 doneMQDWrite(mqd_base, addr);
624 });
625 // Undo the <<= 8 applied in newQueue() before writing back.
625 mqd->base >>= 8;
626 dmaWriteVirt(addr, sizeof(QueueDesc), cb, mqd);
627 queues.erase(id);
628 hsa_pp.unsetDeviceQueueDesc(id, 8);
629 delete mqd;
630 }
631 }
632}
633
// Handle UNMAP_QUEUES. queueSel 0 deallocates VMIDs by doorbell offset,
// queueSel 1 by PASID, and queueSel 2/3 unmap all queues (2 includes static
// queues, 3 keeps them).
// NOTE(review): in the queueSel==0 path, numQueues==1 deallocates all four
// doorbell offsets while numQueues==4 deallocates only doorbellOffset3 --
// this mapping looks inverted relative to the field name; confirm against
// upstream gem5 before relying on it.
// (Signature line lost in extraction.)
634void
636{
637 q->incRptr(sizeof(PM4UnmapQueues));
638
639 DPRINTF(PM4PacketProcessor, "PM4 unmap_queues queueSel: %d numQueues: %d "
640 "pasid: %p doorbellOffset0 %p \n",
641 pkt->queueSel, pkt->numQueues, pkt->pasid, pkt->doorbellOffset0);
642
643 switch (pkt->queueSel) {
644 case 0:
645 switch (pkt->numQueues) {
646 case 1:
647 gpuDevice->deallocateVmid(
648 gpuDevice->getVMID(pkt->doorbellOffset0));
649 gpuDevice->deallocateVmid(
650 gpuDevice->getVMID(pkt->doorbellOffset1));
651 gpuDevice->deallocateVmid(
652 gpuDevice->getVMID(pkt->doorbellOffset2));
653 gpuDevice->deallocateVmid(
654 gpuDevice->getVMID(pkt->doorbellOffset3));
655 break;
656 case 2:
657 gpuDevice->deallocateVmid(
658 gpuDevice->getVMID(pkt->doorbellOffset1));
659 gpuDevice->deallocateVmid(
660 gpuDevice->getVMID(pkt->doorbellOffset2));
661 gpuDevice->deallocateVmid(
662 gpuDevice->getVMID(pkt->doorbellOffset3));
663 break;
664 case 3:
665 gpuDevice->deallocateVmid(
666 gpuDevice->getVMID(pkt->doorbellOffset2));
667 gpuDevice->deallocateVmid(
668 gpuDevice->getVMID(pkt->doorbellOffset3));
669 break;
670 case 4:
671 gpuDevice->deallocateVmid(
672 gpuDevice->getVMID(pkt->doorbellOffset3));
673 break;
674 default:
675 panic("Unrecognized number of queues %d\n", pkt->numQueues);
676 }
677 break;
678 case 1:
679 gpuDevice->deallocatePasid(pkt->pasid);
680 break;
681 case 2:
682 unmapAllQueues(true);
683 gpuDevice->deallocateAllQueues(true);
684 break;
685 case 3:
686 unmapAllQueues(false);
687 gpuDevice->deallocateAllQueues(false);
688 break;
689 default:
690 panic("Unrecognized options\n");
691 break;
692 }
693
694 delete pkt;
695 decodeNext(q);
696}
697
// DMA completion for an MQD write-back during unmap; logging only.
// (Signature line lost in extraction; presumably
// doneMQDWrite(Addr mqdAddr, Addr addr).)
698void
700 DPRINTF(PM4PacketProcessor, "PM4 unmap_queues MQD %p wrote to addr %p\n",
701 mqdAddr, addr);
702}
703
704void
705PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
706 uint32_t shMemBases)
707{
708 uint16_t vmid = gpuDevice->allocateVMID(pasid);
709
710 gpuDevice->getVM().setPageTableBase(vmid, ptBase);
711 gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, shMemBases);
712
713 // Setup the apertures that gem5 uses. These values are bits [63:48].
714 Addr lds_base = (Addr)bits(shMemBases, 31, 16) << 48;
715 Addr scratch_base = (Addr)bits(shMemBases, 15, 0) << 48;
716
717 // There does not seem to be any register for the limit, but the driver
718 // assumes scratch and LDS have a 4GB aperture, so use that.
719 gpuDevice->CP()->shader()->setLdsApe(lds_base, lds_base + 0xFFFFFFFF);
720 gpuDevice->CP()->shader()->setScratchApe(scratch_base,
721 scratch_base + 0xFFFFFFFF);
722}
723
// Handle the legacy (pre-gfx90a) MAP_PROCESS packet layout; defers to the
// shared mapProcess() helper. (Signature line lost in extraction.)
724void
726{
727 q->incRptr(sizeof(PM4MapProcess));
728
729 DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
730 "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
731 pkt->ptBase, pkt->completionSignal);
732
733 mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
734
735 delete pkt;
736 decodeNext(q);
737}
738
// Handle the V2 MAP_PROCESS packet layout used by gfx90a/gfx942/gfx950;
// defers to the shared mapProcess() helper. (Signature line lost in
// extraction.)
739void
741{
742 q->incRptr(sizeof(PM4MapProcessV2));
743
744 DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
745 "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
746 pkt->ptBase, pkt->completionSignal);
747
748 mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
749
750 delete pkt;
751 decodeNext(q);
752}
753
// Handle RUN_LIST: switch the queue into indirect-buffer mode pointed at
// the run list, resetting the IB read pointer to 0 and setting the write
// pointer from the packet's dword size. (Signature line lost in
// extraction.)
754void
756{
757 DPRINTF(PM4PacketProcessor, "PM4 run_list base: %p size: %d\n",
758 pkt->ibBase, pkt->ibSize);
759
760 q->incRptr(sizeof(PM4RunList));
761
762 q->ib(true);
763 q->ibBase(pkt->ibBase);
764 q->rptr(0);
765 q->wptr(pkt->ibSize * sizeof(uint32_t));
766
767 delete pkt;
768 decodeNext(q);
769}
770
// Handle INDIRECT_BUFFER: enter IB mode at the given base. Unlike runList()
// the IB read pointer is not reset here. (Signature line lost in
// extraction.)
771void
773{
774 DPRINTF(PM4PacketProcessor, "PM4 indirect buffer, base: %p.\n",
775 pkt->ibBase);
776
777 q->incRptr(sizeof(PM4IndirectBuf));
778
779 q->ib(true);
780 q->ibBase(pkt->ibBase);
781 q->wptr(pkt->ibSize * sizeof(uint32_t));
782
783 delete pkt;
784 decodeNext(q);
785}
786
// Handle SWITCH_BUFFER: simply (re-)enter IB mode on this queue; the IB
// base and pointers are left as-is. (Signature line lost in extraction.)
787void
789{
790 q->incRptr(sizeof(PM4SwitchBuf));
791
792 q->ib(true);
793 DPRINTF(PM4PacketProcessor, "PM4 switching buffer, rptr: %p.\n",
794 q->wptr());
795
796 delete pkt;
797 decodeNext(q);
798}
799
// Handle SET_UCONFIG_REG: convert the packet's dword register offset into a
// byte address relative to SET_UCONFIG_REG_START, adjust for this CP's MMIO
// aperture, and write the value. (Signature line lost in extraction.)
800void
802{
803 q->incRptr(sizeof(PM4SetUconfigReg));
804
805 DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n",
806 pkt->offset, pkt->data);
807
808 // SET_UCONFIG_REG_START and pkt->offset are dword addresses
809 uint32_t reg_addr = (PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4;
810
811 // Additional CPs respond to addresses 0x40000 apart.
812 reg_addr += 0x40000 * getIpId();
813 gpuDevice->setRegVal(reg_addr, pkt->data);
814
815 delete pkt;
816 decodeNext(q);
817}
818
// Handle WAIT_REG_MEM. The wait is not modeled: the packet is logged and
// decoding continues immediately (the simulated device's register/memory
// writes are effectively immediate). (Signature line lost in extraction.)
819void
821{
822 q->incRptr(sizeof(PM4WaitRegMem));
823
824 DPRINTF(PM4PacketProcessor, "PM4 WAIT_REG_MEM\nfunc: %d memSpace: %d op: "
825 "%d\n", pkt->function, pkt->memSpace, pkt->operation);
826 DPRINTF(PM4PacketProcessor, " AddrLo/Reg1: %lx\n", pkt->memAddrLo);
827 DPRINTF(PM4PacketProcessor, " AddrHi/Reg2: %lx\n", pkt->memAddrHi);
828 DPRINTF(PM4PacketProcessor, " Reference: %lx\n", pkt->reference);
829 DPRINTF(PM4PacketProcessor, " Mask: %lx\n", pkt->mask);
830 DPRINTF(PM4PacketProcessor, " Poll Interval: %lx\n", pkt->pollInterval);
831
832 delete pkt;
833 decodeNext(q);
834}
835
// Handle QUERY_STATUS. Only the combination interruptSel==0 / command==2
// (write the 64-bit data value to the fence address) is implemented; other
// combinations are not used by amdkfd v9 and panic. (Signature line lost
// in extraction.)
836void
838{
839 q->incRptr(sizeof(PM4QueryStatus));
840
841 DPRINTF(PM4PacketProcessor, "PM4 query status contextId: %d, interruptSel:"
842 " %d command: %d, pasid: %d, doorbellOffset: %d, engineSel: %d "
843 "addr: %lx, data: %lx\n", pkt->contextId, pkt->interruptSel,
844 pkt->command, pkt->pasid, pkt->doorbellOffset, pkt->engineSel,
845 pkt->addr, pkt->data);
846
847 if (pkt->interruptSel == 0 && pkt->command == 2) {
848 // Write data value to fence address
849 Addr addr = getGARTAddr(pkt->addr);
850 DPRINTF(PM4PacketProcessor, "Using GART addr %lx\n", addr);
851 auto cb = new DmaVirtCallback<uint64_t>(
852 [ = ] (const uint64_t &) { queryStatusDone(q, pkt); }, pkt->data);
853 dmaWriteVirt(addr, sizeof(uint64_t), cb, &cb->dmaBuffer);
854 } else {
855 // No other combinations used in amdkfd v9
856 panic("query_status with interruptSel %d command %d not supported",
857 pkt->interruptSel, pkt->command);
858 }
859}
860
// DMA completion for QUERY_STATUS: free the packet and resume decoding.
// (Signature line lost in extraction.)
861void
863{
864 DPRINTF(PM4PacketProcessor, "PM4 query status complete\n");
865
866 delete pkt;
867 decodeNext(q);
868}
869
// Dispatch an MMIO register write to the matching KIQ (HQD) or ring-buffer
// (RB) setter. Writes to the doorbell-control registers additionally
// register the doorbell type with the GPU device. Unknown offsets are
// silently ignored.
// NOTE(review): extraction dropped many "case mm...:" label lines (original
// lines 884, 887, 894, 897, 900, 903, 906, 909, 912, 915, 931, 934, 937,
// 940, 949, 953, 956) as well as the signature line -- confirm the label
// names against upstream gem5.
870void
872{
873 switch (mmio_offset) {
874 /* Hardware queue descriptor (HQD) registers */
875 case mmCP_HQD_VMID:
876 setHqdVmid(pkt->getLE<uint32_t>());
877 break;
878 case mmCP_HQD_ACTIVE:
879 setHqdActive(pkt->getLE<uint32_t>());
880 break;
881 case mmCP_HQD_PQ_BASE:
882 setHqdPqBase(pkt->getLE<uint32_t>());
883 break;
885 setHqdPqBaseHi(pkt->getLE<uint32_t>());
886 break;
888 setHqdPqDoorbellCtrl(pkt->getLE<uint32_t>());
889 gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId());
890 break;
891 case mmCP_HQD_PQ_RPTR:
892 setHqdPqPtr(pkt->getLE<uint32_t>());
893 break;
895 setHqdPqWptrLo(pkt->getLE<uint32_t>());
896 break;
898 setHqdPqWptrHi(pkt->getLE<uint32_t>());
899 break;
901 setHqdPqRptrReportAddr(pkt->getLE<uint32_t>());
902 break;
904 setHqdPqRptrReportAddrHi(pkt->getLE<uint32_t>());
905 break;
907 setHqdPqWptrPollAddr(pkt->getLE<uint32_t>());
908 break;
910 setHqdPqWptrPollAddrHi(pkt->getLE<uint32_t>());
911 break;
913 setHqdPqControl(pkt->getLE<uint32_t>());
914 break;
916 setHqdIbCtrl(pkt->getLE<uint32_t>());
917 break;
918 /* Ring buffer registers */
919 case mmCP_RB_VMID:
920 setRbVmid(pkt->getLE<uint32_t>());
921 break;
922 case mmCP_RB0_CNTL:
923 setRbCntl(pkt->getLE<uint32_t>());
924 break;
925 case mmCP_RB0_WPTR:
926 setRbWptrLo(pkt->getLE<uint32_t>());
927 break;
928 case mmCP_RB0_WPTR_HI:
929 setRbWptrHi(pkt->getLE<uint32_t>());
930 break;
932 setRbRptrAddrLo(pkt->getLE<uint32_t>());
933 break;
935 setRbRptrAddrHi(pkt->getLE<uint32_t>());
936 break;
938 setRbWptrPollAddrLo(pkt->getLE<uint32_t>());
939 break;
941 setRbWptrPollAddrHi(pkt->getLE<uint32_t>());
942 break;
943 case mmCP_RB0_BASE:
944 setRbBaseLo(pkt->getLE<uint32_t>());
945 break;
946 case mmCP_RB0_BASE_HI:
947 setRbBaseHi(pkt->getLE<uint32_t>());
948 break;
950 setRbDoorbellCntrl(pkt->getLE<uint32_t>());
951 gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId());
952 break;
954 setRbDoorbellRangeLo(pkt->getLE<uint32_t>());
955 break;
957 setRbDoorbellRangeHi(pkt->getLE<uint32_t>());
958 break;
959 default:
960 break;
961 }
962}
963
// ---------------------------------------------------------------------
// KIQ (kernel interface queue) register setters, called from writeMMIO().
// Each stores one MMIO-written value into the corresponding field of the
// kiq descriptor. NOTE(review): every signature line in this run was lost
// in extraction (setHqdVmid, setHqdActive, setHqdPqBase, setHqdPqBaseHi,
// setHqdPqDoorbellCtrl, setHqdPqPtr, setHqdPqWptrLo, setHqdPqWptrHi,
// setHqdPqRptrReportAddr, setHqdPqRptrReportAddrHi, setHqdPqWptrPollAddr,
// setHqdPqWptrPollAddrHi, setHqdPqControl, setHqdIbCtrl, in that order per
// the writeMMIO dispatch) -- confirm against upstream gem5.
// ---------------------------------------------------------------------
964void
966{
967 kiq.hqd_vmid = data;
968}
969
970void
972{
973 kiq.hqd_active = data;
974}
975
976void
978{
979 kiq.hqd_pq_base_lo = data;
980}
981
982void
984{
985 kiq.hqd_pq_base_hi = data;
986}
987
988void
990{
991 kiq.hqd_pq_doorbell_control = data;
992}
993
994void
996{
997 kiq.rptr = data;
998}
999
1000void
1002{
1003 /* Write pointer communicated through doorbell value. */
1004}
1005
1006void
1008{
1009 /* Write pointer communicated through doorbell value. */
1010}
1011
1012void
1014{
1015 kiq.hqd_pq_rptr_report_addr_lo = data;
1016}
1017
1018void
1020{
1021 kiq.hqd_pq_rptr_report_addr_hi = data;
1022}
1023
1024void
1026{
1027 kiq.hqd_pq_wptr_poll_addr_lo = data;
1028}
1029
1030void
1032{
1033 kiq.hqd_pq_wptr_poll_addr_hi = data;
1034}
1035
1036void
1038{
1039 kiq.hqd_pq_control = data;
1040}
1041
1042void
1044{
1045 kiq.hqd_ib_control = data;
1046}
1047
// ---------------------------------------------------------------------
// Primary queue (PQ / gfx ring buffer) register setters, called from
// writeMMIO(). Each stores one MMIO-written value into the pq descriptor.
// The doorbell-control setter also caches the doorbell byte offset (low
// control bits masked off). NOTE(review): every signature line in this run
// was lost in extraction (setRbVmid, setRbCntl, setRbWptrLo, setRbWptrHi,
// setRbRptrAddrLo, setRbRptrAddrHi, setRbWptrPollAddrLo,
// setRbWptrPollAddrHi, setRbBaseLo, setRbBaseHi, setRbDoorbellCntrl,
// setRbDoorbellRangeLo, setRbDoorbellRangeHi, in that order per the
// writeMMIO dispatch) -- confirm against upstream gem5.
// ---------------------------------------------------------------------
1048void
1050{
1051 pq.hqd_vmid = data;
1052}
1053
1054void
1056{
1057 pq.hqd_pq_control = data;
1058}
1059
1060void
1062{
1063 pq.queueWptrLo = data;
1064}
1065
1066void
1068{
1069 pq.queueWptrHi = data;
1070}
1071
1072void
1074{
1075 pq.queueRptrAddrLo = data;
1076}
1077
1078void
1080{
1081 pq.queueRptrAddrHi = data;
1082}
1083
1084void
1086{
1087 pq.hqd_pq_wptr_poll_addr_lo = data;
1088}
1089
1090void
1092{
1093 pq.hqd_pq_wptr_poll_addr_hi = data;
1094}
1095
1096void
1098{
1099 pq.hqd_pq_base_lo = data;
1100}
1101
1102void
1104{
1105 pq.hqd_pq_base_hi = data;
1106}
1107
1108void
1110{
1111 pq.hqd_pq_doorbell_control = data;
1112 pq.doorbellOffset = data & 0x1ffffffc;
1113}
1114
1115void
1117{
1118 pq.doorbellRangeLo = data;
1119}
1120
1121void
1123{
1124 pq.doorbellRangeHi = data;
1125}
1126
// Checkpoint the packet processor. Queue state is flattened into parallel
// arrays; each queue is sampled twice -- once with IB mode forced off (ring
// pointers) and once forced on (indirect-buffer pointers) -- then restored
// to its original mode before moving on.
// NOTE(review): extraction dropped original line 1131 (presumably the
// DmaVirtDevice::serialize(cp) call announced by the comment below) and
// line 1201 (presumably SERIALIZE_UNIQUE_PTR_ARRAY(offset, num_queues)) --
// confirm against upstream gem5. (Signature line lost too.)
1127void
1129{
1130 // Serialize the DmaVirtDevice base class
1132
1133 int num_queues = queues.size();
1134 auto id = std::make_unique<Addr[]>(num_queues);
1135 auto mqd_base = std::make_unique<Addr[]>(num_queues);
1136 auto mqd_read_index = std::make_unique<uint64_t[]>(num_queues);
1137 auto base = std::make_unique<Addr[]>(num_queues);
1138 auto rptr = std::make_unique<Addr[]>(num_queues);
1139 auto wptr = std::make_unique<Addr[]>(num_queues);
1140 auto ib_base = std::make_unique<Addr[]>(num_queues);
1141 auto ib_rptr = std::make_unique<Addr[]>(num_queues);
1142 auto ib_wptr = std::make_unique<Addr[]>(num_queues);
1143 auto offset = std::make_unique<Addr[]>(num_queues);
1144 auto processing = std::make_unique<bool[]>(num_queues);
1145 auto ib = std::make_unique<bool[]>(num_queues);
1146 auto me = std::make_unique<uint32_t[]>(num_queues);
1147 auto pipe = std::make_unique<uint32_t[]>(num_queues);
1148 auto queue = std::make_unique<uint32_t[]>(num_queues);
1149 auto privileged = std::make_unique<bool[]>(num_queues);
1150 auto queue_type = std::make_unique<uint32_t[]>(num_queues);
1151 auto hqd_active = std::make_unique<uint32_t[]>(num_queues);
1152 auto hqd_vmid = std::make_unique<uint32_t[]>(num_queues);
1153 auto aql_rptr = std::make_unique<Addr[]>(num_queues);
1154 auto aql = std::make_unique<uint32_t[]>(num_queues);
1155 auto doorbell = std::make_unique<uint32_t[]>(num_queues);
1156 auto hqd_pq_control = std::make_unique<uint32_t[]>(num_queues);
1157
1158 int i = 0;
1159 for (auto iter : queues) {
1160 PM4Queue *q = iter.second;
1161 id[i] = q->id();
1162 mqd_base[i] = q->mqdBase();
1163 mqd_read_index[i] = q->getMQD()->mqdReadIndex;
1164 bool cur_state = q->ib();
1165 q->ib(false);
1166 base[i] = q->base();
1167 rptr[i] = q->getRptr();
1168 wptr[i] = q->getWptr();
1169 q->ib(true);
1170 ib_base[i] = q->ibBase();
1171 ib_rptr[i] = q->getRptr();
1172 ib_wptr[i] = q->getWptr();
1173 q->ib(cur_state);
1174 offset[i] = q->offset();
1175 processing[i] = q->processing();
1176 ib[i] = q->ib();
1177 me[i] = q->me();
1178 pipe[i] = q->pipe();
1179 queue[i] = q->queue();
1180 privileged[i] = q->privileged();
1181 queue_type[i] = q->queueType();
1182 hqd_active[i] = q->getMQD()->hqd_active;
1183 hqd_vmid[i] = q->getMQD()->hqd_vmid;
1184 aql_rptr[i] = q->getMQD()->aqlRptr;
1185 aql[i] = q->getMQD()->aql;
1186 doorbell[i] = q->getMQD()->doorbell;
1187 hqd_pq_control[i] = q->getMQD()->hqd_pq_control;
1188 i++;
1189 }
1190
1191 SERIALIZE_SCALAR(num_queues);
1192 SERIALIZE_UNIQUE_PTR_ARRAY(id, num_queues);
1193 SERIALIZE_UNIQUE_PTR_ARRAY(mqd_base, num_queues);
1194 SERIALIZE_UNIQUE_PTR_ARRAY(mqd_read_index, num_queues);
1195 SERIALIZE_UNIQUE_PTR_ARRAY(base, num_queues);
1196 SERIALIZE_UNIQUE_PTR_ARRAY(rptr, num_queues);
1197 SERIALIZE_UNIQUE_PTR_ARRAY(wptr, num_queues);
1198 SERIALIZE_UNIQUE_PTR_ARRAY(ib_base, num_queues);
1199 SERIALIZE_UNIQUE_PTR_ARRAY(ib_rptr, num_queues);
1200 SERIALIZE_UNIQUE_PTR_ARRAY(ib_wptr, num_queues);
1202 SERIALIZE_UNIQUE_PTR_ARRAY(processing, num_queues);
1203 SERIALIZE_UNIQUE_PTR_ARRAY(ib, num_queues);
1204 SERIALIZE_UNIQUE_PTR_ARRAY(me, num_queues);
1205 SERIALIZE_UNIQUE_PTR_ARRAY(pipe, num_queues);
1206 SERIALIZE_UNIQUE_PTR_ARRAY(queue, num_queues);
1207 SERIALIZE_UNIQUE_PTR_ARRAY(privileged, num_queues);
1208 SERIALIZE_UNIQUE_PTR_ARRAY(queue_type, num_queues);
1209 SERIALIZE_UNIQUE_PTR_ARRAY(hqd_active, num_queues);
1210 SERIALIZE_UNIQUE_PTR_ARRAY(hqd_vmid, num_queues);
1211 SERIALIZE_UNIQUE_PTR_ARRAY(aql_rptr, num_queues);
1212 SERIALIZE_UNIQUE_PTR_ARRAY(aql, num_queues);
1213 SERIALIZE_UNIQUE_PTR_ARRAY(doorbell, num_queues);
1214 SERIALIZE_UNIQUE_PTR_ARRAY(hqd_pq_control, num_queues);
1215}
1216
1217void
1219{
1220 // Serialize the DmaVirtDevice base class
1222
1223 int num_queues = 0;
1224 UNSERIALIZE_SCALAR(num_queues);
1225
1226 auto id = std::make_unique<Addr[]>(num_queues);
1227 auto mqd_base = std::make_unique<Addr[]>(num_queues);
1228 auto mqd_read_index = std::make_unique<uint64_t[]>(num_queues);
1229 auto base = std::make_unique<Addr[]>(num_queues);
1230 auto rptr = std::make_unique<Addr[]>(num_queues);
1231 auto wptr = std::make_unique<Addr[]>(num_queues);
1232 auto ib_base = std::make_unique<Addr[]>(num_queues);
1233 auto ib_rptr = std::make_unique<Addr[]>(num_queues);
1234 auto ib_wptr = std::make_unique<Addr[]>(num_queues);
1235 auto offset = std::make_unique<Addr[]>(num_queues);
1236 auto processing = std::make_unique<bool[]>(num_queues);
1237 auto ib = std::make_unique<bool[]>(num_queues);
1238 auto me = std::make_unique<uint32_t[]>(num_queues);
1239 auto pipe = std::make_unique<uint32_t[]>(num_queues);
1240 auto queue = std::make_unique<uint32_t[]>(num_queues);
1241 auto privileged = std::make_unique<bool[]>(num_queues);
1242 auto queue_type = std::make_unique<uint32_t[]>(num_queues);
1243 auto hqd_active = std::make_unique<uint32_t[]>(num_queues);
1244 auto hqd_vmid = std::make_unique<uint32_t[]>(num_queues);
1245 auto aql_rptr = std::make_unique<Addr[]>(num_queues);
1246 auto aql = std::make_unique<uint32_t[]>(num_queues);
1247 auto doorbell = std::make_unique<uint32_t[]>(num_queues);
1248 auto hqd_pq_control = std::make_unique<uint32_t[]>(num_queues);
1249
1250 UNSERIALIZE_UNIQUE_PTR_ARRAY(id, num_queues);
1251 UNSERIALIZE_UNIQUE_PTR_ARRAY(mqd_base, num_queues);
1252 UNSERIALIZE_UNIQUE_PTR_ARRAY(mqd_read_index, num_queues);
1254 UNSERIALIZE_UNIQUE_PTR_ARRAY(rptr, num_queues);
1255 UNSERIALIZE_UNIQUE_PTR_ARRAY(wptr, num_queues);
1256 UNSERIALIZE_UNIQUE_PTR_ARRAY(ib_base, num_queues);
1257 UNSERIALIZE_UNIQUE_PTR_ARRAY(ib_rptr, num_queues);
1258 UNSERIALIZE_UNIQUE_PTR_ARRAY(ib_wptr, num_queues);
1260 UNSERIALIZE_UNIQUE_PTR_ARRAY(processing, num_queues);
1261 UNSERIALIZE_UNIQUE_PTR_ARRAY(ib, num_queues);
1262 UNSERIALIZE_UNIQUE_PTR_ARRAY(me, num_queues);
1263 UNSERIALIZE_UNIQUE_PTR_ARRAY(pipe, num_queues);
1264 UNSERIALIZE_UNIQUE_PTR_ARRAY(queue, num_queues);
1265 UNSERIALIZE_UNIQUE_PTR_ARRAY(privileged, num_queues);
1266 UNSERIALIZE_UNIQUE_PTR_ARRAY(queue_type, num_queues);
1267 UNSERIALIZE_UNIQUE_PTR_ARRAY(hqd_active, num_queues);
1268 UNSERIALIZE_UNIQUE_PTR_ARRAY(hqd_vmid, num_queues);
1269 UNSERIALIZE_UNIQUE_PTR_ARRAY(aql_rptr, num_queues);
1270 UNSERIALIZE_UNIQUE_PTR_ARRAY(aql, num_queues);
1271 UNSERIALIZE_UNIQUE_PTR_ARRAY(doorbell, num_queues);
1272 UNSERIALIZE_UNIQUE_PTR_ARRAY(hqd_pq_control, num_queues);
1273
1274 for (int i = 0; i < num_queues; i++) {
1275 QueueDesc *mqd = new QueueDesc();
1276 memset(mqd, 0, sizeof(QueueDesc));
1277
1278 mqd->mqdBase = mqd_base[i] >> 8;
1279 mqd->mqdReadIndex = mqd_read_index[i];
1280 mqd->base = base[i] >> 8;
1281 mqd->aql = aql[i];
1282
1283 PM4MapQueues* pkt = new PM4MapQueues;
1284 memset(pkt, 0, sizeof(PM4MapQueues));
1285 newQueue(mqd, offset[i], pkt, id[i]);
1286
1287 if (ib[i]) {
1288 queues[id[i]]->wptr(ib_wptr[i]);
1289 queues[id[i]]->rptr(ib_rptr[i]);
1290 } else {
1291 queues[id[i]]->rptr(rptr[i]);
1292 queues[id[i]]->wptr(wptr[i]);
1293 }
1294 queues[id[i]]->ib(ib[i]);
1295 queues[id[i]]->offset(offset[i]);
1296 queues[id[i]]->processing(processing[i]);
1297 queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i],
1298 queue_type[i]);
1299 queues[id[i]]->getMQD()->hqd_active = hqd_active[i];
1300 queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i];
1301 queues[id[i]]->getMQD()->aqlRptr = aql_rptr[i];
1302 queues[id[i]]->getMQD()->doorbell = doorbell[i];
1303 queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i];
1304
1305 if (mqd->aql) {
1306 int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4;
1307 auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
1308 hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i],
1309 mqd_size, 8, GfxVersion::gfx900, offset[i],
1310 mqd_read_index[i]);
1311 }
1312
1313 DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n",
1314 queues[id[i]]->id(), queues[id[i]]->rptr(),
1315 queues[id[i]]->wptr());
1316 }
1317}
1318
1319} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
const char data[]
Device model for an AMD GPU.
Translation range generators.
Definition amdgpu_vm.hh:391
void serialize(CheckpointOut &cp) const override
Serialize an object.
void unserialize(CheckpointIn &cp) override
Unserialize an object.
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
DmaVirtDevice(const Params &p)
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void writeMMIO(PacketPtr pkt, Addr mmio_offset)
void setRbWptrPollAddrLo(uint32_t data)
void decodeHeader(PM4Queue *q, PM4Header header)
This method calls other PM4 packet processing methods based on the header of a PM4 packet.
void unserialize(CheckpointIn &cp) override
Unserialize an object.
void mapKiq(Addr offset)
The first compute queue, the Kernel Interface Queue a.k.a.
Addr getGARTAddr(Addr addr) const
void writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr)
void switchBuffer(PM4Queue *q, PM4SwitchBuf *pkt)
void setGPUDevice(AMDGPUDevice *gpu_device)
void serialize(CheckpointOut &cp) const override
Serialize an object.
std::unordered_map< uint32_t, PM4Queue * > queuesMap
void setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
void queryStatus(PM4Queue *q, PM4QueryStatus *pkt)
void releaseMem(PM4Queue *q, PM4ReleaseMem *pkt)
void releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr)
void setHqdPqRptrReportAddr(uint32_t data)
void updateReadIndex(Addr offset, uint64_t rd_idx)
Update read index on doorbell rings.
void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt)
void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, SDMAQueueDesc *mqd, uint16_t vmid)
void process(PM4Queue *q, Addr wptrOffset)
This method start processing a PM4Queue from the current read pointer to the newly communicated write...
void setHqdPqControl(uint32_t data)
void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt)
void setRbRptrAddrHi(uint32_t data)
void setHqdPqWptrPollAddr(uint32_t data)
void newQueue(QueueDesc *q, Addr offset, PM4MapQueues *pkt=nullptr, int id=-1)
This method creates a new PM4Queue based on a queue descriptor and an offset.
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt)
void queryStatusDone(PM4Queue *q, PM4QueryStatus *pkt)
void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases)
void setRbDoorbellRangeLo(uint32_t data)
void waitRegMem(PM4Queue *q, PM4WaitRegMem *pkt)
void setHqdPqBaseHi(uint32_t data)
void runList(PM4Queue *q, PM4RunList *pkt)
void decodeNext(PM4Queue *q)
This method decodes the next packet in a PM4Queue.
void mapPq(Addr offset)
The first graphics queue, the Primary Queue a.k.a.
void writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header)
void setHqdPqDoorbellCtrl(uint32_t data)
void setRbDoorbellRangeHi(uint32_t data)
void doneMQDWrite(Addr mqdAddr, Addr addr)
std::unordered_map< uint16_t, PM4Queue * > queues
void indirectBuffer(PM4Queue *q, PM4IndirectBuf *pkt)
PM4PacketProcessor(const PM4PacketProcessorParams &p)
void unmapAllQueues(bool unmap_static)
void setHqdPqRptrReportAddrHi(uint32_t data)
void mapQueues(PM4Queue *q, PM4MapQueues *pkt)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Method for functional translation.
void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd, uint16_t vmid)
void setRbRptrAddrLo(uint32_t data)
void setRbDoorbellCntrl(uint32_t data)
PM4Queue * getQueue(Addr offset, bool gfx=false)
Based on an offset communicated through doorbell write, the PM4PacketProcessor identifies which queue...
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void setHqdPqWptrPollAddrHi(uint32_t data)
void setRbWptrPollAddrHi(uint32_t data)
Class defining a PM4 queue.
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
System DMA Engine class for AMD dGPU.
void registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd, bool isStatic)
Methods for RLC queues.
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
std::list< AddrRange > AddrRangeList
Convenience typedef for a collection of address ranges.
Definition addr_range.hh:64
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232
#define UNSERIALIZE_UNIQUE_PTR_ARRAY(member, size)
Definition serialize.hh:634
#define SERIALIZE_UNIQUE_PTR_ARRAY(member, size)
Definition serialize.hh:626
#define warn(...)
Definition logging.hh:288
Bitfield< 27 > q
Definition misc_types.hh:55
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 33 > id
Bitfield< 0 > p
Bitfield< 12 > me
Definition misc.hh:118
Bitfield< 3 > addr
Definition types.hh:84
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
struct gem5::GEM5_PACKED PM4WriteData
struct gem5::GEM5_PACKED PM4WaitRegMem
std::ostream CheckpointOut
Definition serialize.hh:66
struct gem5::GEM5_PACKED PM4RunList
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
Definition types.hh:147
@ SOC15_IH_CLIENTID_GRBM_CP
struct gem5::GEM5_PACKED PM4ReleaseMem
struct gem5::GEM5_PACKED PM4SwitchBuf
Bitfield< 10 > pasid
Definition x86_cpu.cc:129
struct gem5::GEM5_PACKED PM4Header
PM4 packets.
struct gem5::GEM5_PACKED PM4MapQueues
struct gem5::GEM5_PACKED PM4MapProcess
Packet * PacketPtr
struct gem5::GEM5_PACKED PM4MapProcessV2
struct gem5::GEM5_PACKED SDMAQueueDesc
Queue descriptor for SDMA-based user queues (RLC queues).
struct gem5::GEM5_PACKED PM4UnmapQueues
std::unique_ptr< TranslationGen > TranslationGenPtr
struct gem5::GEM5_PACKED PM4SetUconfigReg
@ IT_RELEASE_MEM
@ IT_WRITE_DATA
@ IT_RUN_LIST
@ IT_MAP_QUEUES
@ IT_SET_UCONFIG_REG
@ IT_MAP_PROCESS
@ IT_INVALIDATE_TLBS
@ IT_QUERY_STATUS
@ IT_WAIT_REG_MEM
@ IT_UNMAP_QUEUES
@ IT_INDIRECT_BUFFER
@ IT_SWITCH_BUFFER
struct gem5::GEM5_PACKED PM4QueryStatus
struct gem5::GEM5_PACKED QueueDesc
Queue descriptor with relevant MQD attributes.
struct gem5::GEM5_PACKED PM4IndirectBuf
@ HW_REG_SH_MEM_BASES
output header
Definition nop.cc:36
Declaration of the Packet class.
#define PACKET3_SET_UCONFIG_REG_START
Value from vega10/pm4_header.h.
#define mmCP_RB_DOORBELL_CONTROL
Definition pm4_mmio.hh:48
#define mmCP_RB0_RPTR_ADDR_HI
Definition pm4_mmio.hh:45
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR
Definition pm4_mmio.hh:59
#define mmCP_RB0_BASE_HI
Definition pm4_mmio.hh:51
#define mmCP_HQD_PQ_DOORBELL_CONTROL
Definition pm4_mmio.hh:57
#define mmCP_HQD_PQ_WPTR_POLL_ADDR
Definition pm4_mmio.hh:61
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI
Definition pm4_mmio.hh:60
#define mmCP_HQD_PQ_BASE
Definition pm4_mmio.hh:55
#define mmCP_RB_DOORBELL_RANGE_UPPER
Definition pm4_mmio.hh:50
#define mmCP_HQD_IB_CONTROL
Definition pm4_mmio.hh:64
#define mmCP_RB0_BASE
Definition pm4_mmio.hh:39
#define mmCP_HQD_VMID
Definition pm4_mmio.hh:54
#define mmCP_RB_WPTR_POLL_ADDR_LO
Definition pm4_mmio.hh:41
#define mmCP_HQD_PQ_RPTR
Definition pm4_mmio.hh:58
#define mmCP_HQD_ACTIVE
Definition pm4_mmio.hh:53
#define mmCP_RB_VMID
Definition pm4_mmio.hh:43
#define mmCP_HQD_PQ_BASE_HI
Definition pm4_mmio.hh:56
#define mmCP_RB0_WPTR_HI
Definition pm4_mmio.hh:47
#define mmCP_HQD_PQ_WPTR_HI
Definition pm4_mmio.hh:66
#define mmCP_HQD_PQ_CONTROL
Definition pm4_mmio.hh:63
#define mmCP_RB_DOORBELL_RANGE_LOWER
Definition pm4_mmio.hh:49
#define mmCP_RB_WPTR_POLL_ADDR_HI
Definition pm4_mmio.hh:42
#define mmCP_RB0_CNTL
Definition pm4_mmio.hh:40
#define mmCP_RB0_RPTR_ADDR
Definition pm4_mmio.hh:44
#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI
Definition pm4_mmio.hh:62
#define mmCP_RB0_WPTR
Definition pm4_mmio.hh:46
#define mmCP_HQD_PQ_WPTR_LO
Definition pm4_mmio.hh:65
#define UNSERIALIZE_SCALAR(scalar)
Definition serialize.hh:575
#define SERIALIZE_SCALAR(scalar)
Definition serialize.hh:568
uint32_t sdmax_rlcx_ib_base_lo
uint32_t sdmax_rlcx_rb_rptr
uint32_t doorbellOffset0
uint32_t sdmax_rlcx_rb_rptr_addr_hi
uint32_t sdmax_rlcx_rb_cntl
uint32_t doorbellOffset3
uint32_t sdmax_rlcx_rb_wptr_hi
uint32_t doorbellOffset2
uint32_t sdmax_rlcx_ib_base_hi
uint32_t doorbellOffset
uint32_t processQuantum
uint32_t hqd_pq_control
uint32_t hqd_active
Definition pm4_queues.hh:96
uint32_t sdmax_rlcx_rb_rptr_addr_lo
uint32_t sdmax_rlcx_rb_wptr
uint32_t sdmax_rlcx_rb_rptr_hi
uint64_t mqdReadIndex
Definition pm4_queues.hh:55
uint32_t doorbellOffset1
uint64_t completionSignal

Generated on Mon Oct 27 2025 04:13:01 for gem5 by doxygen 1.14.0