gem5 [DEVELOP-FOR-25.0]
Loading...
Searching...
No Matches
pm4_packet_processor.cc
Go to the documentation of this file.
1/*
2 * Copyright (c) 2021 Advanced Micro Devices, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. Neither the name of the copyright holder nor the names of its
16 * contributors may be used to endorse or promote products derived from this
17 * software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
34
35#include "debug/PM4PacketProcessor.hh"
42#include "enums/GfxVersion.hh"
44#include "gpu-compute/shader.hh"
45#include "mem/packet.hh"
46#include "mem/packet_access.hh"
47
48namespace gem5
49{
50
51PM4PacketProcessor::PM4PacketProcessor(const PM4PacketProcessorParams &p)
52 : DmaVirtDevice(p), _ipId(p.ip_id), _mmioRange(p.mmio_range)
53{
54 memset(&kiq, 0, sizeof(QueueDesc));
55 memset(&pq, 0, sizeof(QueueDesc));
56}
57
65{
66 if (gpuDevice->getVM().inAGP(vaddr)) {
67 // Use AGP translation gen
68 return TranslationGenPtr(
69 new AMDGPUVM::AGPTranslationGen(&gpuDevice->getVM(), vaddr, size));
70 }
71
72 // Assume GART otherwise as this is the only other translation aperture
73 // available to the PM4 packet processor.
74 return TranslationGenPtr(
75 new AMDGPUVM::GARTTranslationGen(&gpuDevice->getVM(), vaddr, size));
76}
77
80{
81 AddrRangeList ranges;
82 return ranges;
83}
84
85void
87{
88 gpuDevice = gpu_device;
89}
90
/**
 * Adjust an address before it is handed to the DMA helpers (this appears to
 * be the getGARTAddr() used by the callers in this file).
 *
 * If the VM reports the address as lying in the AGP aperture it is returned
 * unchanged. Otherwise the value above bit 11 is shifted left by three (i.e.
 * multiplied by 8) and recombined with bits(addr, 11, 0).
 *
 * @return the possibly adjusted address.
 */
91Addr
93{
94 if (!gpuDevice->getVM().inAGP(addr)) {
// Keep bits [11:0] as they are and scale everything above them by 8.
// NOTE(review): presumably 8 is the byte size of one GART page table
// entry and 12 the page shift -- confirm against the VM implementation.
95 Addr low_bits = bits(addr, 11, 0);
96 addr = (((addr >> 12) << 3) << 12) | low_bits;
97 }
98 return addr;
99}
100
/**
 * Look up the queue registered in queuesMap under `offset`, mapping one on
 * demand if none is registered yet.
 *
 * When no entry exists, mapPq(offset) is called if `gfx` is true and
 * mapKiq(offset) otherwise, and the entry then stored under `offset` is
 * returned.
 *
 * @return the queue found in (or newly added to) queuesMap for `offset`.
 */
101PM4Queue *
103{
104 auto result = queuesMap.find(offset);
105 if (result == queuesMap.end()) {
106 if (gfx)
107 mapPq(offset);
108 else
109 mapKiq(offset);
// NOTE(review): this relies on mapPq()/mapKiq() having inserted an entry
// for `offset`; if queuesMap is a std map, operator[] would otherwise
// default-insert and return a null pointer -- confirm.
110 return queuesMap[offset];
111 }
112 return result->second;
113}
114
115void
121
122void
128
129void
131 PM4MapQueues *pkt, int id)
132{
133 if (id == -1)
134 id = queues.size();
135
136 /* 256 bytes aligned address */
137 mqd->base <<= 8;
138 PM4Queue *q = new PM4Queue(id, mqd, offset, pkt);
139
140 queuesMap[offset] = q;
141 queues[id] = q;
142
143 /* we are assuming only compute queues can be mapped from MQDs */
144 QueueType qt;
145 qt = mqd->aql ? QueueType::ComputeAQL
147 gpuDevice->setDoorbellType(offset, qt, getIpId());
148
149 DPRINTF(PM4PacketProcessor, "New PM4 queue %d, base: %p offset: %p, me: "
150 "%d, pipe %d queue: %d size: %d\n", id, q->base(), q->offset(),
151 q->me(), q->pipe(), q->queue(), q->size());
152}
153
/**
 * Record a new write pointer for queue `q` and start decoding if the queue
 * is not already being processed.
 *
 * `wptrOffset` is multiplied by sizeof(uint32_t) before being stored.
 * NOTE(review): presumably the incoming value is a dword offset and the
 * queue keeps byte offsets -- confirm against the doorbell path.
 */
154void
156{
157 q->wptr(wptrOffset * sizeof(uint32_t));
158
// Only kick off decoding when idle; decodeNext() clears the processing
// flag again once rptr has caught up with wptr.
159 if (!q->processing()) {
160 q->processing(true);
161 decodeNext(q);
162 }
163}
164
/**
 * Advance processing of queue `q` by one step.
 *
 * If rptr differs from wptr, a DMA read of sizeof(uint32_t) bytes is issued
 * at the (GART-adjusted) read pointer and the result is passed to
 * decodeHeader(). If rptr equals wptr the queue is marked as not processing;
 * when the queue is currently in indirect-buffer mode that mode is left and
 * decoding is retried. Finally, if the MQD has a non-zero aqlRptr, the
 * current read pointer is written back to that address.
 */
165void
167{
168 DPRINTF(PM4PacketProcessor, "PM4 decode queue %d rptr %p, wptr %p\n",
169 q->id(), q->rptr(), q->wptr());
170
171 if (q->rptr() != q->wptr()) {
172 /* Additional braces here are needed due to a clang compilation bug
173 falsely throwing a "suggest braces around initialization of
174 subject" error. More info on this bug is available here:
175 https://stackoverflow.com/questions/31555584
176 */
177 PM4Header h{{{0, 0, 0, 0, 0, 0}}};
// The callback object owns the header buffer (cb->dmaBuffer) that the DMA
// read fills in; the lambda forwards the header to decodeHeader().
178 auto cb = new DmaVirtCallback<PM4Header>(
179 [ = ] (PM4Header header)
180 { decodeHeader(q, header); }, h);
181 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(uint32_t), cb,
182 &cb->dmaBuffer);
183 } else {
184 // Reached the end of processable data in the queue. Switch out of IB
185 // if this is an indirect buffer.
186 assert(q->rptr() == q->wptr());
187 q->processing(false);
// NOTE(review): the processing flag is cleared before the recursive call
// below and is not set again in this function, so decoding of the parent
// ring continues with processing() == false -- confirm this is intended.
188 if (q->ib()) {
189 q->ib(false);
190 decodeNext(q);
191 }
192
193 // Write back rptr when the queue is empty. For static queues which
194 // are not unmapped, this is how the driver knows there is enough
195 // space in the queue to continue writing packets to the ring buffer.
196 if (q->getMQD()->aqlRptr) {
197 Addr addr = getGARTAddr(q->getMQD()->aqlRptr);
// Heap buffer for the DMA write; it is released by the completion callback.
198 uint32_t *data = new uint32_t;
199 // gem5 stores rptr as a bytes offset while the driver expects
200 // a dword offset. Convert the offset to dword count.
201 *data = q->getRptr() >> 2;
202 auto cb = new DmaVirtCallback<uint32_t>(
203 [data](const uint32_t &) { delete data; });
204 dmaWriteVirt(addr, sizeof(uint32_t), cb, data);
205 }
206 }
207}
208
209void
211{
212 DPRINTF(PM4PacketProcessor, "PM4 packet %p\n", header.opcode);
213
214 q->incRptr(sizeof(PM4Header));
215
216 DmaVirtCallback<uint64_t> *cb = nullptr;
217 void *dmaBuffer = nullptr;
218
219 switch(header.opcode) {
220 case IT_NOP: {
221 DPRINTF(PM4PacketProcessor, "PM4 nop, count %p\n", header.count);
222 DPRINTF(PM4PacketProcessor, "rptr %p wptr %p\n", q->rptr(), q->wptr());
223 if (header.count != 0x3fff) {
224 q->incRptr((header.count + 1) * sizeof(uint32_t));
225 }
226 decodeNext(q);
227 } break;
228 case IT_WRITE_DATA: {
229 dmaBuffer = new PM4WriteData();
230 DPRINTF(PM4PacketProcessor, "PM4 writeData header: %x, count: %d\n",
231 header.ordinal, header.count);
233 [ = ] (const uint64_t &)
234 { writeData(q, (PM4WriteData *)dmaBuffer, header); });
235 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WriteData), cb,
236 dmaBuffer);
237 } break;
238
239 case IT_MAP_QUEUES: {
240 dmaBuffer = new PM4MapQueues();
242 [ = ] (const uint64_t &)
243 { mapQueues(q, (PM4MapQueues *)dmaBuffer); });
244 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapQueues), cb,
245 dmaBuffer);
246 } break;
247
248 case IT_RELEASE_MEM: {
249 dmaBuffer = new PM4ReleaseMem();
251 [ = ] (const uint64_t &)
252 { releaseMem(q, (PM4ReleaseMem *)dmaBuffer); });
253 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4ReleaseMem), cb,
254 dmaBuffer);
255 } break;
256
257 case IT_INDIRECT_BUFFER: {
258 dmaBuffer = new PM4IndirectBuf();
260 [ = ] (const uint64_t &)
261 { indirectBuffer(q, (PM4IndirectBuf *)dmaBuffer); });
262 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4IndirectBuf), cb,
263 dmaBuffer);
264 } break;
265
266 case IT_SWITCH_BUFFER: {
267 dmaBuffer = new PM4SwitchBuf();
269 [ = ] (const uint64_t &)
270 { switchBuffer(q, (PM4SwitchBuf *)dmaBuffer); });
271 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4SwitchBuf), cb,
272 dmaBuffer);
273 } break;
274
275 case IT_SET_UCONFIG_REG: {
276 dmaBuffer = new PM4SetUconfigReg();
278 [ = ] (const uint64_t &)
279 { setUconfigReg(q, (PM4SetUconfigReg *)dmaBuffer); });
280 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4SetUconfigReg), cb,
281 dmaBuffer);
282 } break;
283
284 case IT_WAIT_REG_MEM: {
285 dmaBuffer = new PM4WaitRegMem();
287 [ = ] (const uint64_t &)
288 { waitRegMem(q, (PM4WaitRegMem *)dmaBuffer); });
289 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4WaitRegMem), cb,
290 dmaBuffer);
291 } break;
292 case IT_MAP_PROCESS: {
293 if (gpuDevice->getGfxVersion() == GfxVersion::gfx90a ||
294 gpuDevice->getGfxVersion() == GfxVersion::gfx942) {
295 dmaBuffer = new PM4MapProcessV2();
297 [ = ] (const uint64_t &)
298 { mapProcessV2(q, (PM4MapProcessV2 *)dmaBuffer); });
299 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcessV2),
300 cb, dmaBuffer);
301 } else {
302 dmaBuffer = new PM4MapProcess();
304 [ = ] (const uint64_t &)
305 { mapProcessV1(q, (PM4MapProcess *)dmaBuffer); });
306 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4MapProcess), cb,
307 dmaBuffer);
308 }
309 } break;
310
311 case IT_UNMAP_QUEUES: {
312 dmaBuffer = new PM4UnmapQueues();
314 [ = ] (const uint64_t &)
315 { unmapQueues(q, (PM4UnmapQueues *)dmaBuffer); });
316 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4UnmapQueues), cb,
317 dmaBuffer);
318 } break;
319
320 case IT_RUN_LIST: {
321 dmaBuffer = new PM4RunList();
323 [ = ] (const uint64_t &)
324 { runList(q, (PM4RunList *)dmaBuffer); });
325 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4RunList), cb,
326 dmaBuffer);
327 } break;
328
329 case IT_QUERY_STATUS: {
330 dmaBuffer = new PM4QueryStatus();
332 [ = ] (const uint64_t &)
333 { queryStatus(q, (PM4QueryStatus *)dmaBuffer); });
334 dmaReadVirt(getGARTAddr(q->rptr()), sizeof(PM4QueryStatus), cb,
335 dmaBuffer);
336 } break;
337
338 case IT_INVALIDATE_TLBS: {
339 DPRINTF(PM4PacketProcessor, "Functionaly invalidating all TLBs\n");
340 gpuDevice->getVM().invalidateTLBs();
341 q->incRptr((header.count + 1) * sizeof(uint32_t));
342 decodeNext(q);
343 } break;
344
345 default: {
346 warn("PM4 packet opcode 0x%x not supported.\n", header.opcode);
347 DPRINTF(PM4PacketProcessor, "PM4 packet opcode 0x%x not supported.\n",
348 header.opcode);
349 q->incRptr((header.count + 1) * sizeof(uint32_t));
350 decodeNext(q);
351 } break;
352 }
353}
354
355void
357{
358 q->incRptr(sizeof(PM4WriteData));
359
360 DPRINTF(PM4PacketProcessor, "PM4 write addr: %p data: %p destSel: %d "
361 "addrIncr: %d resume: %d writeConfirm: %d cachePolicy: %d\n",
362 pkt->destAddr, pkt->data, pkt->destSel, pkt->addrIncr,
363 pkt->resume, pkt->writeConfirm, pkt->cachePolicy);
364
365 if (pkt->destSel == 5) {
366 // Memory address destination
368
369 // This is a variable length packet. The size of the packet is in
370 // the header.count field and is set as Number Of Dwords - 1. This
371 // packet is 4 dwords minimum meaning the count is minimum 3. To
372 // get the number of dwords of data subtract two from the count.
373 unsigned size = (header.count - 2) * sizeof(uint32_t);
374
375 DPRINTF(PM4PacketProcessor, "Writing %d bytes to %p\n", size, addr);
376 auto cb = new DmaVirtCallback<uint32_t>(
377 [ = ](const uint32_t &) { writeDataDone(q, pkt, addr); });
378 dmaWriteVirt(addr, size, cb, &pkt->data);
379
380 if (!pkt->writeConfirm) {
381 decodeNext(q);
382 }
383 } else if (pkt->destSel == 0) {
384 // Register dword address destination
385 Addr byte_addr = pkt->destAddr << 2;
386
387 gpuDevice->setRegVal(byte_addr, pkt->data);
388
389 // setRegVal is instant on the simulated device so we ignore write
390 // confirm.
391 delete pkt;
392 decodeNext(q);
393 } else {
394 fatal("Unknown PM4 writeData destination %d\n", pkt->destSel);
395 }
396}
397
/**
 * Completion handler for the memory write issued by writeData().
 *
 * Logs the completed write, resumes decoding of `q` only when the packet
 * requested write confirmation (writeData() already resumed decoding
 * itself when writeConfirm was not set), and frees the packet.
 */
398void
400{
401 DPRINTF(PM4PacketProcessor, "PM4 write completed to %p, %p.\n", addr,
402 pkt->data);
403
404 if (pkt->writeConfirm) {
405 decodeNext(q);
406 }
407
// The packet buffer doubled as the DMA source, so it is only released here.
408 delete pkt;
409}
410
411void
413{
414 q->incRptr(sizeof(PM4MapQueues));
415
416 DPRINTF(PM4PacketProcessor, "MAPQueues queueSel: %d, vmid: %d, me: %d, "
417 "pipe: %d, queueSlot: %d, queueType: %d, allocFormat: %d, "
418 "engineSel: %d, numQueues: %d, checkDisable: %d, doorbellOffset:"
419 " %d, mqdAddr: %lx, wptrAddr: %lx\n", pkt->queueSel, pkt->vmid,
420 pkt->me, pkt->pipe, pkt->queueSlot, pkt->queueType,
421 pkt->allocFormat, pkt->engineSel, pkt->numQueues,
422 pkt->checkDisable, pkt->doorbellOffset, pkt->mqdAddr,
423 pkt->wptrAddr);
424
425 // Partially reading the mqd with an offset of 96 dwords
426 if (pkt->engineSel == 0 || pkt->engineSel == 1 || pkt->engineSel == 4) {
427 Addr addr = getGARTAddr(pkt->mqdAddr + 96 * sizeof(uint32_t));
428
430 "Mapping mqd from %p %p (vmid %d - last vmid %d).\n",
431 addr, pkt->mqdAddr, pkt->vmid, gpuDevice->lastVMID());
432
433 // The doorbellOffset is a dword address. We shift by two / multiply
434 // by four to get the byte address to match doorbell addresses in
435 // the GPU device.
436 gpuDevice->mapDoorbellToVMID(pkt->doorbellOffset << 2,
437 gpuDevice->lastVMID());
438
439 QueueDesc *mqd = new QueueDesc();
440 memset(mqd, 0, sizeof(QueueDesc));
441 auto cb = new DmaVirtCallback<uint32_t>(
442 [ = ] (const uint32_t &) {
443 processMQD(pkt, q, addr, mqd, gpuDevice->lastVMID()); });
444 dmaReadVirt(addr, sizeof(QueueDesc), cb, mqd);
445 } else if (pkt->engineSel == 2 || pkt->engineSel == 3) {
446 SDMAQueueDesc *sdmaMQD = new SDMAQueueDesc();
447 memset(sdmaMQD, 0, sizeof(SDMAQueueDesc));
448
449 // For SDMA we read the full MQD, so there is no offset calculation.
451
452 auto cb = new DmaVirtCallback<uint32_t>(
453 [ = ] (const uint32_t &) {
454 processSDMAMQD(pkt, q, addr, sdmaMQD,
455 gpuDevice->lastVMID()); });
456 dmaReadVirt(addr, sizeof(SDMAQueueDesc), cb, sdmaMQD);
457 } else {
458 panic("Unknown engine for MQD: %d\n", pkt->engineSel);
459 }
460}
461
/**
 * Completion handler for the MQD read started by mapQueues().
 *
 * Creates a new PM4 queue for the descriptor at the doorbell offset taken
 * from mqd->doorbell, records the queue id for `vmid` in the GPU device
 * and, for AQL queues, registers the queue with the HSA packet processor.
 * Pending doorbells for the offset are then processed, the MAP_QUEUES
 * packet is freed and decoding of `q` continues.
 */
462void
464 QueueDesc *mqd, uint16_t vmid)
465{
466 DPRINTF(PM4PacketProcessor, "MQDbase: %lx, active: %d, vmid: %d, base: "
467 "%lx, rptr: %x aqlPtr: %lx\n", mqd->mqdBase, mqd->hqd_active,
468 mqd->hqd_vmid, mqd->base, mqd->rptr, mqd->aqlRptr);
469
// Mask the doorbell register value down to the doorbell offset.
470 Addr offset = mqd->doorbell & 0x1ffffffc;
471 newQueue(mqd, offset, pkt);
472 PM4Queue *new_q = queuesMap[offset];
473 gpuDevice->insertQId(vmid, new_q->id());
474
475 if (mqd->aql) {
476 // The queue size is encoded in the cp_hqd_pq_control field in the
477 // kernel driver in the 6 lowest bits as log2(queue_size / 4) - 1
478 // number of dwords.
479 //
480 // https://github.com/RadeonOpenCompute/ROCK-Kernel-Driver/blob/
481 // roc-4.3.x/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c#L3561
482 //
483 // Queue size is then 2^(cp_hqd_pq_control[5:0] + 1) dword. Multiply
484 // by 4 to get the number of bytes as HSAPP expects.
// NOTE(review): the shift operand is an int literal and the shift count
// can reach 64, so large field values overflow the int -- confirm the
// driver never programs such sizes.
485 int mqd_size = (1 << ((mqd->hqd_pq_control & 0x3f) + 1)) * 4;
486 auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
// NOTE(review): the GfxVersion argument is hard-coded to gfx900 here
// regardless of the device's configured version -- confirm.
487 hsa_pp.setDeviceQueueDesc(mqd->aqlRptr, mqd->base, new_q->id(),
488 mqd_size, 8, GfxVersion::gfx900, offset,
489 mqd->mqdReadIndex);
490 }
491
492 DPRINTF(PM4PacketProcessor, "PM4 mqd read completed, base %p, mqd %p, "
493 "hqdAQL %d.\n", mqd->base, mqd->mqdBase, mqd->aql);
494
495 gpuDevice->processPendingDoorbells(offset);
496
497 delete pkt;
498 decodeNext(q);
499}
500
501void
503 SDMAQueueDesc *mqd, uint16_t vmid)
504{
505 uint32_t rlc_size = 4UL << bits(mqd->sdmax_rlcx_rb_cntl, 6, 1);
506 Addr rptr_wb_addr = mqd->sdmax_rlcx_rb_rptr_addr_hi;
507 rptr_wb_addr <<= 32;
508 rptr_wb_addr |= mqd->sdmax_rlcx_rb_rptr_addr_lo;
509
510 DPRINTF(PM4PacketProcessor, "SDMAMQD: rb base: %#lx rptr: %#x/%#x wptr: "
511 "%#x/%#x ib: %#x/%#x size: %d ctrl: %#x rptr wb addr: %#lx\n",
515 rlc_size, mqd->sdmax_rlcx_rb_cntl, rptr_wb_addr);
516
517 // Engine 2 points to SDMA0 while engine 3 points to SDMA1
518 assert(pkt->engineSel == 2 || pkt->engineSel == 3);
519 SDMAEngine *sdma_eng = gpuDevice->getSDMAById(pkt->engineSel - 2);
520
521 // Queue type 1 and 2 are "static" queues
522 bool is_static = (pkt->queueType == 2) || (pkt->queueType == 3);
523
524 // Register RLC queue with SDMA
525 sdma_eng->registerRLCQueue(pkt->doorbellOffset << 2, addr, mqd, is_static);
526
527 // Register doorbell with GPU device
528 gpuDevice->setSDMAEngine(pkt->doorbellOffset << 2, sdma_eng);
529 gpuDevice->setDoorbellType(pkt->doorbellOffset << 2, RLC, getIpId());
530
531 gpuDevice->processPendingDoorbells(pkt->doorbellOffset << 2);
532
533 delete pkt;
534 decodeNext(q);
535}
536
537void
539{
540 q->incRptr(sizeof(PM4ReleaseMem));
541
542 Addr addr = getGARTAddr(pkt->addr);
543 DPRINTF(PM4PacketProcessor, "PM4 release_mem event %d eventIdx %d intSel "
544 "%d destSel %d dataSel %d, address %p data %p, intCtx %p\n",
545 pkt->event, pkt->eventIdx, pkt->intSelect, pkt->destSelect,
546 pkt->dataSelect, addr, pkt->dataLo, pkt->intCtxId);
547
549 "PM4 release_mem destSel 0 bypasses caches to MC.\n");
550
551 if (pkt->dataSelect == 1) {
552 auto cb = new DmaVirtCallback<uint32_t>(
553 [ = ](const uint32_t &) { releaseMemDone(q, pkt, addr); },
554 pkt->dataLo);
555 dmaWriteVirt(addr, sizeof(uint32_t), cb, &cb->dmaBuffer);
556 } else {
557 panic("Unimplemented PM4ReleaseMem.dataSelect");
558 }
559}
560
561void
563{
564 DPRINTF(PM4PacketProcessor, "PM4 release_mem wrote %d to %p\n",
565 pkt->dataLo, addr);
566 if (pkt->intSelect == 2) {
567 DPRINTF(PM4PacketProcessor, "PM4 interrupt, id: %d ctx: %d, me: %d, "
568 "pipe: %d, queueSlot:%d\n", q->id(), pkt->intCtxId, q->me(),
569 q->pipe(), q->queue());
570
571 uint8_t ringId = 0;
572 if (q->id() != 0) {
573 ringId = (q->queue() << 4) | (q->me() << 2) | q->pipe();
574 }
575 gpuDevice->getIH()->prepareInterruptCookie(pkt->intCtxId, ringId,
577 2 * getIpId());
578 gpuDevice->getIH()->submitInterruptCookie();
579 }
580
581 delete pkt;
582 decodeNext(q);
583}
584
/**
 * Store `rd_idx` as the mqdReadIndex of the MQD belonging to the queue
 * registered under `offset`. A queue must already be registered for that
 * offset (checked by assertion only).
 */
585void
587{
588 assert(queuesMap.count(offset));
589 queuesMap[offset]->getMQD()->mqdReadIndex = rd_idx;
590}
591
/**
 * Unmap every queue id listed by the GPU device's used-VMID table, except
 * privileged queues and (unless `unmap_static` is true) static queues.
 *
 * For each unmapped queue the gem5 portion of the MQD is written back to
 * memory, the id is removed from `queues`, and the queue is unregistered
 * from the HSA packet processor.
 */
592void
594{
595 auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
596 for (auto iter : gpuDevice->getUsedVMIDs()) {
597 for (auto id : iter.second) {
598 assert(queues.count(id));
599
600 // Do not unmap KMD queues.
601 if (queues[id]->privileged()) {
602 continue;
603 }
604
605 // Do not unmap static queues if requested.
606 if (!unmap_static && queues[id]->isStatic()) {
607 continue;
608 }
609
610 QueueDesc *mqd = queues[id]->getMQD();
611 DPRINTF(PM4PacketProcessor, "Unmapping queue %d with read "
612 "index %ld\n", id, mqd->mqdReadIndex);
613
614 // Partially writing the mqd with an offset of 96 dwords as gem5
615 // does not use the full MQD and begins 96 dwords from the start
616 // of the full MQD structure. See src/dev/amdgpu/pm4_queues.hh.
617 Addr addr = getGARTAddr(queues[id]->mqdBase() +
618 96 * sizeof(uint32_t));
619 Addr mqd_base = queues[id]->mqdBase();
620 auto cb = new DmaVirtCallback<uint32_t>(
621 [ = ] (const uint32_t &) {
622 doneMQDWrite(mqd_base, addr);
623 });
// Undo the 8-bit left shift applied to mqd->base when the queue was
// created, so the value written back matches what was read.
624 mqd->base >>= 8;
625 dmaWriteVirt(addr, sizeof(QueueDesc), cb, mqd);
// NOTE(review): only the `queues` entry is erased here; the PM4Queue
// object and any queuesMap entry are not released in this function --
// confirm they are cleaned up elsewhere.
626 queues.erase(id);
627 hsa_pp.unsetDeviceQueueDesc(id, 8);
// NOTE(review): mqd is the source buffer of the DMA write issued above;
// if dmaWriteVirt() does not copy it before returning, freeing it here
// is a use-after-free -- confirm the buffer lifetime contract.
628 delete mqd;
629 }
630 }
631}
632
/**
 * Handle an UNMAP_QUEUES packet.
 *
 * Advances the read pointer past the packet and dispatches on
 * pkt->queueSel:
 *   0 - deallocate the VMIDs looked up from the packet's doorbell offsets,
 *       selected by pkt->numQueues;
 *   1 - deallocate by PASID;
 *   2 - unmap all queues including static ones;
 *   3 - unmap all queues except static ones.
 * Any other selector panics. The packet is freed and decoding continues.
 */
633void
635{
636 q->incRptr(sizeof(PM4UnmapQueues));
637
638 DPRINTF(PM4PacketProcessor, "PM4 unmap_queues queueSel: %d numQueues: %d "
639 "pasid: %p doorbellOffset0 %p \n",
640 pkt->queueSel, pkt->numQueues, pkt->pasid, pkt->doorbellOffset0);
641
642 switch (pkt->queueSel) {
643 case 0:
// NOTE(review): numQueues == 1 releases all four doorbell offsets while
// numQueues == 4 releases only doorbellOffset3; this looks inverted
// relative to the field name -- confirm against the packet definition.
644 switch (pkt->numQueues) {
645 case 1:
646 gpuDevice->deallocateVmid(
647 gpuDevice->getVMID(pkt->doorbellOffset0));
648 gpuDevice->deallocateVmid(
649 gpuDevice->getVMID(pkt->doorbellOffset1));
650 gpuDevice->deallocateVmid(
651 gpuDevice->getVMID(pkt->doorbellOffset2));
652 gpuDevice->deallocateVmid(
653 gpuDevice->getVMID(pkt->doorbellOffset3));
654 break;
655 case 2:
656 gpuDevice->deallocateVmid(
657 gpuDevice->getVMID(pkt->doorbellOffset1));
658 gpuDevice->deallocateVmid(
659 gpuDevice->getVMID(pkt->doorbellOffset2));
660 gpuDevice->deallocateVmid(
661 gpuDevice->getVMID(pkt->doorbellOffset3));
662 break;
663 case 3:
664 gpuDevice->deallocateVmid(
665 gpuDevice->getVMID(pkt->doorbellOffset2));
666 gpuDevice->deallocateVmid(
667 gpuDevice->getVMID(pkt->doorbellOffset3));
668 break;
669 case 4:
670 gpuDevice->deallocateVmid(
671 gpuDevice->getVMID(pkt->doorbellOffset3));
672 break;
673 default:
674 panic("Unrecognized number of queues %d\n", pkt->numQueues);
675 }
676 break;
677 case 1:
678 gpuDevice->deallocatePasid(pkt->pasid);
679 break;
680 case 2:
// Unmap everything, static queues included.
681 unmapAllQueues(true);
682 gpuDevice->deallocateAllQueues(true);
683 break;
684 case 3:
// Unmap everything except static queues.
685 unmapAllQueues(false);
686 gpuDevice->deallocateAllQueues(false);
687 break;
688 default:
689 panic("Unrecognized options\n");
690 break;
691 }
692
693 delete pkt;
694 decodeNext(q);
695}
696
697void
699 DPRINTF(PM4PacketProcessor, "PM4 unmap_queues MQD %p wrote to addr %p\n",
700 mqdAddr, addr);
701}
702
703void
704PM4PacketProcessor::mapProcess(uint32_t pasid, uint64_t ptBase,
705 uint32_t shMemBases)
706{
707 uint16_t vmid = gpuDevice->allocateVMID(pasid);
708
709 gpuDevice->getVM().setPageTableBase(vmid, ptBase);
710 gpuDevice->CP()->shader()->setHwReg(HW_REG_SH_MEM_BASES, shMemBases);
711
712 // Setup the apertures that gem5 uses. These values are bits [63:48].
713 Addr lds_base = (Addr)bits(shMemBases, 31, 16) << 48;
714 Addr scratch_base = (Addr)bits(shMemBases, 15, 0) << 48;
715
716 // There does not seem to be any register for the limit, but the driver
717 // assumes scratch and LDS have a 4GB aperture, so use that.
718 gpuDevice->CP()->shader()->setLdsApe(lds_base, lds_base + 0xFFFFFFFF);
719 gpuDevice->CP()->shader()->setScratchApe(scratch_base,
720 scratch_base + 0xFFFFFFFF);
721}
722
/**
 * Handle a MAP_PROCESS packet in the PM4MapProcess layout.
 *
 * Advances the read pointer past the packet, forwards the PASID, page
 * table base and SH_MEM_BASES value to mapProcess(), frees the packet and
 * continues decoding `q`.
 */
723void
725{
726 q->incRptr(sizeof(PM4MapProcess));
727
728 DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
729 "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
730 pkt->ptBase, pkt->completionSignal);
731
732 mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
733
734 delete pkt;
735 decodeNext(q);
736}
737
/**
 * Handle a MAP_PROCESS packet in the PM4MapProcessV2 layout.
 *
 * Identical to the PM4MapProcess handler except for the packet type, and
 * therefore the number of bytes the read pointer is advanced by: the same
 * three fields are forwarded to mapProcess(), the packet is freed and
 * decoding of `q` continues.
 */
738void
740{
741 q->incRptr(sizeof(PM4MapProcessV2));
742
743 DPRINTF(PM4PacketProcessor, "PM4 map_process pasid: %p quantum: "
744 "%d pt: %p signal: %p\n", pkt->pasid, pkt->processQuantum,
745 pkt->ptBase, pkt->completionSignal);
746
747 mapProcess(pkt->pasid, pkt->ptBase, pkt->shMemBases);
748
749 delete pkt;
750 decodeNext(q);
751}
752
/**
 * Handle a RUN_LIST packet by switching queue `q` to the indirect buffer
 * the packet describes.
 *
 * The packet is freed and decoding continues, now from the indirect
 * buffer.
 */
753void
755{
756 DPRINTF(PM4PacketProcessor, "PM4 run_list base: %p size: %d\n",
757 pkt->ibBase, pkt->ibSize);
758
// Advance past the packet before ib(true) is set.
// NOTE(review): presumably the ib flag selects which rptr/wptr pair the
// accessors below operate on, so this increment applies to the ring
// pointer -- confirm in PM4Queue.
759 q->incRptr(sizeof(PM4RunList));
760
761 q->ib(true);
762 q->ibBase(pkt->ibBase);
// Start at offset 0; ibSize is scaled by sizeof(uint32_t) to form the
// write pointer.
763 q->rptr(0);
764 q->wptr(pkt->ibSize * sizeof(uint32_t));
765
766 delete pkt;
767 decodeNext(q);
768}
769
/**
 * Handle an INDIRECT_BUFFER packet by switching queue `q` to the indirect
 * buffer the packet describes.
 *
 * The packet is freed and decoding continues.
 */
770void
772{
773 DPRINTF(PM4PacketProcessor, "PM4 indirect buffer, base: %p.\n",
774 pkt->ibBase);
775
776 q->incRptr(sizeof(PM4IndirectBuf));
777
778 q->ib(true);
779 q->ibBase(pkt->ibBase);
// NOTE(review): unlike the RUN_LIST handler, the read pointer is not
// reset to 0 after entering IB mode -- confirm whether that is intended.
780 q->wptr(pkt->ibSize * sizeof(uint32_t));
781
782 delete pkt;
783 decodeNext(q);
784}
785
/**
 * Handle a SWITCH_BUFFER packet: advance past it, put queue `q` into
 * indirect-buffer mode, free the packet and continue decoding. No field of
 * the packet itself is read.
 */
786void
788{
789 q->incRptr(sizeof(PM4SwitchBuf));
790
791 q->ib(true);
// NOTE(review): the message labels the value "rptr" but q->wptr() is what
// gets printed -- confirm which one was intended.
792 DPRINTF(PM4PacketProcessor, "PM4 switching buffer, rptr: %p.\n",
793 q->wptr());
794
795 delete pkt;
796 decodeNext(q);
797}
798
/**
 * Handle a SET_UCONFIG_REG packet by writing pkt->data to the register it
 * addresses.
 *
 * The register byte address is
 * (PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4, offset by 0x40000 for
 * every IP id above zero. The packet is freed and decoding continues.
 */
799void
801{
802 q->incRptr(sizeof(PM4SetUconfigReg));
803
804 DPRINTF(PM4PacketProcessor, "SetUconfig offset %x data %x\n",
805 pkt->offset, pkt->data);
806
807 // SET_UCONFIG_REG_START and pkt->offset are dword addresses
808 uint32_t reg_addr = (PACKET3_SET_UCONFIG_REG_START + pkt->offset) * 4;
809
810 // Additional CPs respond to addresses 0x40000 apart.
811 reg_addr += 0x40000 * getIpId();
812 gpuDevice->setRegVal(reg_addr, pkt->data);
813
814 delete pkt;
815 decodeNext(q);
816}
817
/**
 * Handle a WAIT_REG_MEM packet.
 *
 * As written, the handler only logs the packet fields; no register or
 * memory location is polled or compared. It advances past the packet,
 * frees it and continues decoding immediately.
 */
818void
820{
821 q->incRptr(sizeof(PM4WaitRegMem));
822
823 DPRINTF(PM4PacketProcessor, "PM4 WAIT_REG_MEM\nfunc: %d memSpace: %d op: "
824 "%d\n", pkt->function, pkt->memSpace, pkt->operation);
825 DPRINTF(PM4PacketProcessor, " AddrLo/Reg1: %lx\n", pkt->memAddrLo);
826 DPRINTF(PM4PacketProcessor, " AddrHi/Reg2: %lx\n", pkt->memAddrHi);
827 DPRINTF(PM4PacketProcessor, " Reference: %lx\n", pkt->reference);
828 DPRINTF(PM4PacketProcessor, " Mask: %lx\n", pkt->mask);
829 DPRINTF(PM4PacketProcessor, " Poll Interval: %lx\n", pkt->pollInterval);
830
831 delete pkt;
832 decodeNext(q);
833}
834
/**
 * Handle a QUERY_STATUS packet.
 *
 * Only the combination interruptSel == 0 and command == 2 is supported: the
 * 64-bit pkt->data value is written by DMA to the (GART-adjusted)
 * pkt->addr, and queryStatusDone() runs on completion. Any other
 * combination panics.
 */
835void
837{
838 q->incRptr(sizeof(PM4QueryStatus));
839
840 DPRINTF(PM4PacketProcessor, "PM4 query status contextId: %d, interruptSel:"
841 " %d command: %d, pasid: %d, doorbellOffset: %d, engineSel: %d "
842 "addr: %lx, data: %lx\n", pkt->contextId, pkt->interruptSel,
843 pkt->command, pkt->pasid, pkt->doorbellOffset, pkt->engineSel,
844 pkt->addr, pkt->data);
845
846 if (pkt->interruptSel == 0 && pkt->command == 2) {
847 // Write data value to fence address
848 Addr addr = getGARTAddr(pkt->addr);
849 DPRINTF(PM4PacketProcessor, "Using GART addr %lx\n", addr);
// pkt->data is handed to the callback object and the DMA write sources
// from the callback's own buffer; the packet is freed in
// queryStatusDone().
850 auto cb = new DmaVirtCallback<uint64_t>(
851 [ = ] (const uint64_t &) { queryStatusDone(q, pkt); }, pkt->data);
852 dmaWriteVirt(addr, sizeof(uint64_t), cb, &cb->dmaBuffer);
853 } else {
854 // No other combinations used in amdkfd v9
855 panic("query_status with interruptSel %d command %d not supported",
856 pkt->interruptSel, pkt->command);
857 }
858}
859
/**
 * Completion handler for the fence write issued by the QUERY_STATUS
 * handler: logs completion, frees the packet and continues decoding `q`.
 */
860void
862{
863 DPRINTF(PM4PacketProcessor, "PM4 query status complete\n");
864
865 delete pkt;
866 decodeNext(q);
867}
868
869void
871{
872 switch (mmio_offset) {
873 /* Hardware queue descriptor (HQD) registers */
874 case mmCP_HQD_VMID:
875 setHqdVmid(pkt->getLE<uint32_t>());
876 break;
877 case mmCP_HQD_ACTIVE:
878 setHqdActive(pkt->getLE<uint32_t>());
879 break;
880 case mmCP_HQD_PQ_BASE:
881 setHqdPqBase(pkt->getLE<uint32_t>());
882 break;
884 setHqdPqBaseHi(pkt->getLE<uint32_t>());
885 break;
887 setHqdPqDoorbellCtrl(pkt->getLE<uint32_t>());
888 gpuDevice->setDoorbellType(getKiqDoorbellOffset(), Compute, getIpId());
889 break;
890 case mmCP_HQD_PQ_RPTR:
891 setHqdPqPtr(pkt->getLE<uint32_t>());
892 break;
894 setHqdPqWptrLo(pkt->getLE<uint32_t>());
895 break;
897 setHqdPqWptrHi(pkt->getLE<uint32_t>());
898 break;
900 setHqdPqRptrReportAddr(pkt->getLE<uint32_t>());
901 break;
903 setHqdPqRptrReportAddrHi(pkt->getLE<uint32_t>());
904 break;
906 setHqdPqWptrPollAddr(pkt->getLE<uint32_t>());
907 break;
909 setHqdPqWptrPollAddrHi(pkt->getLE<uint32_t>());
910 break;
912 setHqdPqControl(pkt->getLE<uint32_t>());
913 break;
915 setHqdIbCtrl(pkt->getLE<uint32_t>());
916 break;
917 /* Ring buffer registers */
918 case mmCP_RB_VMID:
919 setRbVmid(pkt->getLE<uint32_t>());
920 break;
921 case mmCP_RB0_CNTL:
922 setRbCntl(pkt->getLE<uint32_t>());
923 break;
924 case mmCP_RB0_WPTR:
925 setRbWptrLo(pkt->getLE<uint32_t>());
926 break;
927 case mmCP_RB0_WPTR_HI:
928 setRbWptrHi(pkt->getLE<uint32_t>());
929 break;
931 setRbRptrAddrLo(pkt->getLE<uint32_t>());
932 break;
934 setRbRptrAddrHi(pkt->getLE<uint32_t>());
935 break;
937 setRbWptrPollAddrLo(pkt->getLE<uint32_t>());
938 break;
940 setRbWptrPollAddrHi(pkt->getLE<uint32_t>());
941 break;
942 case mmCP_RB0_BASE:
943 setRbBaseLo(pkt->getLE<uint32_t>());
944 break;
945 case mmCP_RB0_BASE_HI:
946 setRbBaseHi(pkt->getLE<uint32_t>());
947 break;
949 setRbDoorbellCntrl(pkt->getLE<uint32_t>());
950 gpuDevice->setDoorbellType(getPqDoorbellOffset(), Gfx, getIpId());
951 break;
953 setRbDoorbellRangeLo(pkt->getLE<uint32_t>());
954 break;
956 setRbDoorbellRangeHi(pkt->getLE<uint32_t>());
957 break;
958 default:
959 break;
960 }
961}
962
963void
965{
966 kiq.hqd_vmid = data;
967}
968
969void
971{
972 kiq.hqd_active = data;
973}
974
975void
977{
978 kiq.hqd_pq_base_lo = data;
979}
980
981void
983{
984 kiq.hqd_pq_base_hi = data;
985}
986
987void
989{
990 kiq.hqd_pq_doorbell_control = data;
991}
992
993void
995{
996 kiq.rptr = data;
997}
998
999void
1001{
1002 /* Write pointer communicated through doorbell value. */
1003}
1004
1005void
1007{
1008 /* Write pointer communicated through doorbell value. */
1009}
1010
1011void
1013{
1014 kiq.hqd_pq_rptr_report_addr_lo = data;
1015}
1016
1017void
1019{
1020 kiq.hqd_pq_rptr_report_addr_hi = data;
1021}
1022
1023void
1025{
1026 kiq.hqd_pq_wptr_poll_addr_lo = data;
1027}
1028
1029void
1031{
1032 kiq.hqd_pq_wptr_poll_addr_hi = data;
1033}
1034
1035void
1037{
1038 kiq.hqd_pq_control = data;
1039}
1040
1041void
1043{
1044 kiq.hqd_ib_control = data;
1045}
1046
1047void
1049{
1050 pq.hqd_vmid = data;
1051}
1052
1053void
1055{
1056 pq.hqd_pq_control = data;
1057}
1058
1059void
1061{
1062 pq.queueWptrLo = data;
1063}
1064
1065void
1067{
1068 pq.queueWptrHi = data;
1069}
1070
1071void
1073{
1074 pq.queueRptrAddrLo = data;
1075}
1076
1077void
1079{
1080 pq.queueRptrAddrHi = data;
1081}
1082
1083void
1085{
1086 pq.hqd_pq_wptr_poll_addr_lo = data;
1087}
1088
1089void
1091{
1092 pq.hqd_pq_wptr_poll_addr_hi = data;
1093}
1094
1095void
1097{
1098 pq.hqd_pq_base_lo = data;
1099}
1100
1101void
1103{
1104 pq.hqd_pq_base_hi = data;
1105}
1106
1107void
1109{
1110 pq.hqd_pq_doorbell_control = data;
1111 pq.doorbellOffset = data & 0x1ffffffc;
1112}
1113
1114void
1116{
1117 pq.doorbellRangeLo = data;
1118}
1119
1120void
1122{
1123 pq.doorbellRangeHi = data;
1124}
1125
1126void
1128{
1129 // Serialize the DmaVirtDevice base class
1131
1132 int num_queues = queues.size();
1133 auto id = std::make_unique<Addr[]>(num_queues);
1134 auto mqd_base = std::make_unique<Addr[]>(num_queues);
1135 auto mqd_read_index = std::make_unique<uint64_t[]>(num_queues);
1136 auto base = std::make_unique<Addr[]>(num_queues);
1137 auto rptr = std::make_unique<Addr[]>(num_queues);
1138 auto wptr = std::make_unique<Addr[]>(num_queues);
1139 auto ib_base = std::make_unique<Addr[]>(num_queues);
1140 auto ib_rptr = std::make_unique<Addr[]>(num_queues);
1141 auto ib_wptr = std::make_unique<Addr[]>(num_queues);
1142 auto offset = std::make_unique<Addr[]>(num_queues);
1143 auto processing = std::make_unique<bool[]>(num_queues);
1144 auto ib = std::make_unique<bool[]>(num_queues);
1145 auto me = std::make_unique<uint32_t[]>(num_queues);
1146 auto pipe = std::make_unique<uint32_t[]>(num_queues);
1147 auto queue = std::make_unique<uint32_t[]>(num_queues);
1148 auto privileged = std::make_unique<bool[]>(num_queues);
1149 auto queue_type = std::make_unique<uint32_t[]>(num_queues);
1150 auto hqd_active = std::make_unique<uint32_t[]>(num_queues);
1151 auto hqd_vmid = std::make_unique<uint32_t[]>(num_queues);
1152 auto aql_rptr = std::make_unique<Addr[]>(num_queues);
1153 auto aql = std::make_unique<uint32_t[]>(num_queues);
1154 auto doorbell = std::make_unique<uint32_t[]>(num_queues);
1155 auto hqd_pq_control = std::make_unique<uint32_t[]>(num_queues);
1156
1157 int i = 0;
1158 for (auto iter : queues) {
1159 PM4Queue *q = iter.second;
1160 id[i] = q->id();
1161 mqd_base[i] = q->mqdBase();
1162 mqd_read_index[i] = q->getMQD()->mqdReadIndex;
1163 bool cur_state = q->ib();
1164 q->ib(false);
1165 base[i] = q->base();
1166 rptr[i] = q->getRptr();
1167 wptr[i] = q->getWptr();
1168 q->ib(true);
1169 ib_base[i] = q->ibBase();
1170 ib_rptr[i] = q->getRptr();
1171 ib_wptr[i] = q->getWptr();
1172 q->ib(cur_state);
1173 offset[i] = q->offset();
1174 processing[i] = q->processing();
1175 ib[i] = q->ib();
1176 me[i] = q->me();
1177 pipe[i] = q->pipe();
1178 queue[i] = q->queue();
1179 privileged[i] = q->privileged();
1180 queue_type[i] = q->queueType();
1181 hqd_active[i] = q->getMQD()->hqd_active;
1182 hqd_vmid[i] = q->getMQD()->hqd_vmid;
1183 aql_rptr[i] = q->getMQD()->aqlRptr;
1184 aql[i] = q->getMQD()->aql;
1185 doorbell[i] = q->getMQD()->doorbell;
1186 hqd_pq_control[i] = q->getMQD()->hqd_pq_control;
1187 i++;
1188 }
1189
1190 SERIALIZE_SCALAR(num_queues);
1191 SERIALIZE_UNIQUE_PTR_ARRAY(id, num_queues);
1192 SERIALIZE_UNIQUE_PTR_ARRAY(mqd_base, num_queues);
1193 SERIALIZE_UNIQUE_PTR_ARRAY(mqd_read_index, num_queues);
1194 SERIALIZE_UNIQUE_PTR_ARRAY(base, num_queues);
1195 SERIALIZE_UNIQUE_PTR_ARRAY(rptr, num_queues);
1196 SERIALIZE_UNIQUE_PTR_ARRAY(wptr, num_queues);
1197 SERIALIZE_UNIQUE_PTR_ARRAY(ib_base, num_queues);
1198 SERIALIZE_UNIQUE_PTR_ARRAY(ib_rptr, num_queues);
1199 SERIALIZE_UNIQUE_PTR_ARRAY(ib_wptr, num_queues);
1201 SERIALIZE_UNIQUE_PTR_ARRAY(processing, num_queues);
1202 SERIALIZE_UNIQUE_PTR_ARRAY(ib, num_queues);
1203 SERIALIZE_UNIQUE_PTR_ARRAY(me, num_queues);
1204 SERIALIZE_UNIQUE_PTR_ARRAY(pipe, num_queues);
1205 SERIALIZE_UNIQUE_PTR_ARRAY(queue, num_queues);
1206 SERIALIZE_UNIQUE_PTR_ARRAY(privileged, num_queues);
1207 SERIALIZE_UNIQUE_PTR_ARRAY(queue_type, num_queues);
1208 SERIALIZE_UNIQUE_PTR_ARRAY(hqd_active, num_queues);
1209 SERIALIZE_UNIQUE_PTR_ARRAY(hqd_vmid, num_queues);
1210 SERIALIZE_UNIQUE_PTR_ARRAY(aql_rptr, num_queues);
1211 SERIALIZE_UNIQUE_PTR_ARRAY(aql, num_queues);
1212 SERIALIZE_UNIQUE_PTR_ARRAY(doorbell, num_queues);
1213 SERIALIZE_UNIQUE_PTR_ARRAY(hqd_pq_control, num_queues);
1214}
1215
1216void
1218{
1219 // Serialize the DmaVirtDevice base class
1221
1222 int num_queues = 0;
1223 UNSERIALIZE_SCALAR(num_queues);
1224
1225 auto id = std::make_unique<Addr[]>(num_queues);
1226 auto mqd_base = std::make_unique<Addr[]>(num_queues);
1227 auto mqd_read_index = std::make_unique<uint64_t[]>(num_queues);
1228 auto base = std::make_unique<Addr[]>(num_queues);
1229 auto rptr = std::make_unique<Addr[]>(num_queues);
1230 auto wptr = std::make_unique<Addr[]>(num_queues);
1231 auto ib_base = std::make_unique<Addr[]>(num_queues);
1232 auto ib_rptr = std::make_unique<Addr[]>(num_queues);
1233 auto ib_wptr = std::make_unique<Addr[]>(num_queues);
1234 auto offset = std::make_unique<Addr[]>(num_queues);
1235 auto processing = std::make_unique<bool[]>(num_queues);
1236 auto ib = std::make_unique<bool[]>(num_queues);
1237 auto me = std::make_unique<uint32_t[]>(num_queues);
1238 auto pipe = std::make_unique<uint32_t[]>(num_queues);
1239 auto queue = std::make_unique<uint32_t[]>(num_queues);
1240 auto privileged = std::make_unique<bool[]>(num_queues);
1241 auto queue_type = std::make_unique<uint32_t[]>(num_queues);
1242 auto hqd_active = std::make_unique<uint32_t[]>(num_queues);
1243 auto hqd_vmid = std::make_unique<uint32_t[]>(num_queues);
1244 auto aql_rptr = std::make_unique<Addr[]>(num_queues);
1245 auto aql = std::make_unique<uint32_t[]>(num_queues);
1246 auto doorbell = std::make_unique<uint32_t[]>(num_queues);
1247 auto hqd_pq_control = std::make_unique<uint32_t[]>(num_queues);
1248
1249 UNSERIALIZE_UNIQUE_PTR_ARRAY(id, num_queues);
1250 UNSERIALIZE_UNIQUE_PTR_ARRAY(mqd_base, num_queues);
1251 UNSERIALIZE_UNIQUE_PTR_ARRAY(mqd_read_index, num_queues);
1253 UNSERIALIZE_UNIQUE_PTR_ARRAY(rptr, num_queues);
1254 UNSERIALIZE_UNIQUE_PTR_ARRAY(wptr, num_queues);
1255 UNSERIALIZE_UNIQUE_PTR_ARRAY(ib_base, num_queues);
1256 UNSERIALIZE_UNIQUE_PTR_ARRAY(ib_rptr, num_queues);
1257 UNSERIALIZE_UNIQUE_PTR_ARRAY(ib_wptr, num_queues);
1259 UNSERIALIZE_UNIQUE_PTR_ARRAY(processing, num_queues);
1260 UNSERIALIZE_UNIQUE_PTR_ARRAY(ib, num_queues);
1261 UNSERIALIZE_UNIQUE_PTR_ARRAY(me, num_queues);
1262 UNSERIALIZE_UNIQUE_PTR_ARRAY(pipe, num_queues);
1263 UNSERIALIZE_UNIQUE_PTR_ARRAY(queue, num_queues);
1264 UNSERIALIZE_UNIQUE_PTR_ARRAY(privileged, num_queues);
1265 UNSERIALIZE_UNIQUE_PTR_ARRAY(queue_type, num_queues);
1266 UNSERIALIZE_UNIQUE_PTR_ARRAY(hqd_active, num_queues);
1267 UNSERIALIZE_UNIQUE_PTR_ARRAY(hqd_vmid, num_queues);
1268 UNSERIALIZE_UNIQUE_PTR_ARRAY(aql_rptr, num_queues);
1269 UNSERIALIZE_UNIQUE_PTR_ARRAY(aql, num_queues);
1270 UNSERIALIZE_UNIQUE_PTR_ARRAY(doorbell, num_queues);
1271 UNSERIALIZE_UNIQUE_PTR_ARRAY(hqd_pq_control, num_queues);
1272
1273 for (int i = 0; i < num_queues; i++) {
1274 QueueDesc *mqd = new QueueDesc();
1275 memset(mqd, 0, sizeof(QueueDesc));
1276
1277 mqd->mqdBase = mqd_base[i] >> 8;
1278 mqd->mqdReadIndex = mqd_read_index[i];
1279 mqd->base = base[i] >> 8;
1280 mqd->aql = aql[i];
1281
1282 PM4MapQueues* pkt = new PM4MapQueues;
1283 memset(pkt, 0, sizeof(PM4MapQueues));
1284 newQueue(mqd, offset[i], pkt, id[i]);
1285
1286 if (ib[i]) {
1287 queues[id[i]]->wptr(ib_wptr[i]);
1288 queues[id[i]]->rptr(ib_rptr[i]);
1289 } else {
1290 queues[id[i]]->rptr(rptr[i]);
1291 queues[id[i]]->wptr(wptr[i]);
1292 }
1293 queues[id[i]]->ib(ib[i]);
1294 queues[id[i]]->offset(offset[i]);
1295 queues[id[i]]->processing(processing[i]);
1296 queues[id[i]]->setPkt(me[i], pipe[i], queue[i], privileged[i],
1297 queue_type[i]);
1298 queues[id[i]]->getMQD()->hqd_active = hqd_active[i];
1299 queues[id[i]]->getMQD()->hqd_vmid = hqd_vmid[i];
1300 queues[id[i]]->getMQD()->aqlRptr = aql_rptr[i];
1301 queues[id[i]]->getMQD()->doorbell = doorbell[i];
1302 queues[id[i]]->getMQD()->hqd_pq_control = hqd_pq_control[i];
1303
1304 if (mqd->aql) {
1305 int mqd_size = (1 << ((hqd_pq_control[i] & 0x3f) + 1)) * 4;
1306 auto &hsa_pp = gpuDevice->CP()->hsaPacketProc();
1307 hsa_pp.setDeviceQueueDesc(aql_rptr[i], base[i], id[i],
1308 mqd_size, 8, GfxVersion::gfx900, offset[i],
1309 mqd_read_index[i]);
1310 }
1311
1312 DPRINTF(PM4PacketProcessor, "PM4 queue %d, rptr: %p wptr: %p\n",
1313 queues[id[i]]->id(), queues[id[i]]->rptr(),
1314 queues[id[i]]->wptr());
1315 }
1316}
1317
1318} // namespace gem5
#define DPRINTF(x,...)
Definition trace.hh:209
const char data[]
Device model for an AMD GPU.
Translation range generators.
Definition amdgpu_vm.hh:342
void serialize(CheckpointOut &cp) const override
Serialize an object.
void unserialize(CheckpointIn &cp) override
Unserialize an object.
Wraps a std::function object in a DmaCallback.
void dmaReadVirt(Addr host_addr, unsigned size, DmaCallback *cb, void *data, Tick delay=0)
Initiate a DMA read from virtual address host_addr.
DmaVirtDevice(const Params &p)
void dmaWriteVirt(Addr host_addr, unsigned size, DmaCallback *b, void *data, Tick delay=0)
Initiate a DMA write from virtual address host_addr.
void writeMMIO(PacketPtr pkt, Addr mmio_offset)
void setRbWptrPollAddrLo(uint32_t data)
void decodeHeader(PM4Queue *q, PM4Header header)
This method calls other PM4 packet processing methods based on the header of a PM4 packet.
void unserialize(CheckpointIn &cp) override
Unserialize an object.
void mapKiq(Addr offset)
The first compute queue, the Kernel Interface Queue a.k.a.
Addr getGARTAddr(Addr addr) const
void writeDataDone(PM4Queue *q, PM4WriteData *pkt, Addr addr)
void switchBuffer(PM4Queue *q, PM4SwitchBuf *pkt)
void setGPUDevice(AMDGPUDevice *gpu_device)
void serialize(CheckpointOut &cp) const override
Serialize an object.
std::unordered_map< uint32_t, PM4Queue * > queuesMap
void setUconfigReg(PM4Queue *q, PM4SetUconfigReg *pkt)
void queryStatus(PM4Queue *q, PM4QueryStatus *pkt)
void releaseMem(PM4Queue *q, PM4ReleaseMem *pkt)
void releaseMemDone(PM4Queue *q, PM4ReleaseMem *pkt, Addr addr)
void setHqdPqRptrReportAddr(uint32_t data)
void updateReadIndex(Addr offset, uint64_t rd_idx)
Update read index on doorbell rings.
void mapProcessV1(PM4Queue *q, PM4MapProcess *pkt)
void processSDMAMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, SDMAQueueDesc *mqd, uint16_t vmid)
void process(PM4Queue *q, Addr wptrOffset)
This method starts processing a PM4Queue from the current read pointer to the newly communicated write...
void setHqdPqControl(uint32_t data)
void mapProcessV2(PM4Queue *q, PM4MapProcessV2 *pkt)
void setRbRptrAddrHi(uint32_t data)
void setHqdPqWptrPollAddr(uint32_t data)
void newQueue(QueueDesc *q, Addr offset, PM4MapQueues *pkt=nullptr, int id=-1)
This method creates a new PM4Queue based on a queue descriptor and an offset.
void unmapQueues(PM4Queue *q, PM4UnmapQueues *pkt)
void queryStatusDone(PM4Queue *q, PM4QueryStatus *pkt)
void mapProcess(uint32_t pasid, uint64_t ptBase, uint32_t shMemBases)
void setRbDoorbellRangeLo(uint32_t data)
void waitRegMem(PM4Queue *q, PM4WaitRegMem *pkt)
void setHqdPqBaseHi(uint32_t data)
void runList(PM4Queue *q, PM4RunList *pkt)
void decodeNext(PM4Queue *q)
This method decodes the next packet in a PM4Queue.
void mapPq(Addr offset)
The first graphics queue, the Primary Queue a.k.a.
void writeData(PM4Queue *q, PM4WriteData *pkt, PM4Header header)
void setHqdPqDoorbellCtrl(uint32_t data)
void setRbDoorbellRangeHi(uint32_t data)
void doneMQDWrite(Addr mqdAddr, Addr addr)
std::unordered_map< uint16_t, PM4Queue * > queues
void indirectBuffer(PM4Queue *q, PM4IndirectBuf *pkt)
PM4PacketProcessor(const PM4PacketProcessorParams &p)
void unmapAllQueues(bool unmap_static)
void setHqdPqRptrReportAddrHi(uint32_t data)
void mapQueues(PM4Queue *q, PM4MapQueues *pkt)
TranslationGenPtr translate(Addr vaddr, Addr size) override
Method for functional translation.
void processMQD(PM4MapQueues *pkt, PM4Queue *q, Addr addr, QueueDesc *mqd, uint16_t vmid)
void setRbRptrAddrLo(uint32_t data)
void setRbDoorbellCntrl(uint32_t data)
PM4Queue * getQueue(Addr offset, bool gfx=false)
Based on an offset communicated through doorbell write, the PM4PacketProcessor identifies which queue...
AddrRangeList getAddrRanges() const override
Every PIO device is obliged to provide an implementation that returns the address ranges the device r...
void setHqdPqWptrPollAddrHi(uint32_t data)
void setRbWptrPollAddrHi(uint32_t data)
Class defining a PM4 queue.
T getLE() const
Get the data in the packet byte swapped from little endian to host endian.
System DMA Engine class for AMD dGPU.
void registerRLCQueue(Addr doorbell, Addr mqdAddr, SDMAQueueDesc *mqd, bool isStatic)
Methods for RLC queues.
The GPUCommandProcessor (CP) is responsible for accepting commands, in the form of HSA AQL packets,...
std::list< AddrRange > AddrRangeList
Convenience typedef for a collection of address ranges.
Definition addr_range.hh:64
constexpr T bits(T val, unsigned first, unsigned last)
Extract the bitfield from position 'first' to 'last' (inclusive) from 'val' and right justify it.
Definition bitfield.hh:79
#define panic(...)
This implements a cprintf based panic() function.
Definition logging.hh:220
#define fatal(...)
This implements a cprintf based fatal() function.
Definition logging.hh:232
#define UNSERIALIZE_UNIQUE_PTR_ARRAY(member, size)
Definition serialize.hh:634
#define SERIALIZE_UNIQUE_PTR_ARRAY(member, size)
Definition serialize.hh:626
#define warn(...)
Definition logging.hh:288
Bitfield< 27 > q
Definition misc_types.hh:55
Bitfield< 7 > i
Definition misc_types.hh:67
Bitfield< 23, 0 > offset
Definition types.hh:144
Bitfield< 33 > id
Bitfield< 0 > p
Bitfield< 12 > me
Definition misc.hh:118
Bitfield< 3 > addr
Definition types.hh:84
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36
struct gem5::GEM5_PACKED PM4WriteData
struct gem5::GEM5_PACKED PM4WaitRegMem
std::ostream CheckpointOut
Definition serialize.hh:66
struct gem5::GEM5_PACKED PM4RunList
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
Definition types.hh:147
@ SOC15_IH_CLIENTID_GRBM_CP
struct gem5::GEM5_PACKED PM4ReleaseMem
struct gem5::GEM5_PACKED PM4SwitchBuf
Bitfield< 10 > pasid
Definition x86_cpu.cc:129
struct gem5::GEM5_PACKED PM4Header
PM4 packets.
struct gem5::GEM5_PACKED PM4MapQueues
struct gem5::GEM5_PACKED PM4MapProcess
Packet * PacketPtr
struct gem5::GEM5_PACKED PM4MapProcessV2
struct gem5::GEM5_PACKED SDMAQueueDesc
Queue descriptor for SDMA-based user queues (RLC queues).
struct gem5::GEM5_PACKED PM4UnmapQueues
std::unique_ptr< TranslationGen > TranslationGenPtr
struct gem5::GEM5_PACKED PM4SetUconfigReg
@ IT_RELEASE_MEM
@ IT_WRITE_DATA
@ IT_RUN_LIST
@ IT_MAP_QUEUES
@ IT_SET_UCONFIG_REG
@ IT_MAP_PROCESS
@ IT_INVALIDATE_TLBS
@ IT_QUERY_STATUS
@ IT_WAIT_REG_MEM
@ IT_UNMAP_QUEUES
@ IT_INDIRECT_BUFFER
@ IT_SWITCH_BUFFER
struct gem5::GEM5_PACKED PM4QueryStatus
struct gem5::GEM5_PACKED QueueDesc
Queue descriptor with relevant MQD attributes.
struct gem5::GEM5_PACKED PM4IndirectBuf
@ HW_REG_SH_MEM_BASES
output header
Definition nop.cc:36
Declaration of the Packet class.
#define PACKET3_SET_UCONFIG_REG_START
Value from vega10/pm4_header.h.
#define mmCP_RB_DOORBELL_CONTROL
Definition pm4_mmio.hh:48
#define mmCP_RB0_RPTR_ADDR_HI
Definition pm4_mmio.hh:45
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR
Definition pm4_mmio.hh:59
#define mmCP_RB0_BASE_HI
Definition pm4_mmio.hh:51
#define mmCP_HQD_PQ_DOORBELL_CONTROL
Definition pm4_mmio.hh:57
#define mmCP_HQD_PQ_WPTR_POLL_ADDR
Definition pm4_mmio.hh:61
#define mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI
Definition pm4_mmio.hh:60
#define mmCP_HQD_PQ_BASE
Definition pm4_mmio.hh:55
#define mmCP_RB_DOORBELL_RANGE_UPPER
Definition pm4_mmio.hh:50
#define mmCP_HQD_IB_CONTROL
Definition pm4_mmio.hh:64
#define mmCP_RB0_BASE
Definition pm4_mmio.hh:39
#define mmCP_HQD_VMID
Definition pm4_mmio.hh:54
#define mmCP_RB_WPTR_POLL_ADDR_LO
Definition pm4_mmio.hh:41
#define mmCP_HQD_PQ_RPTR
Definition pm4_mmio.hh:58
#define mmCP_HQD_ACTIVE
Definition pm4_mmio.hh:53
#define mmCP_RB_VMID
Definition pm4_mmio.hh:43
#define mmCP_HQD_PQ_BASE_HI
Definition pm4_mmio.hh:56
#define mmCP_RB0_WPTR_HI
Definition pm4_mmio.hh:47
#define mmCP_HQD_PQ_WPTR_HI
Definition pm4_mmio.hh:66
#define mmCP_HQD_PQ_CONTROL
Definition pm4_mmio.hh:63
#define mmCP_RB_DOORBELL_RANGE_LOWER
Definition pm4_mmio.hh:49
#define mmCP_RB_WPTR_POLL_ADDR_HI
Definition pm4_mmio.hh:42
#define mmCP_RB0_CNTL
Definition pm4_mmio.hh:40
#define mmCP_RB0_RPTR_ADDR
Definition pm4_mmio.hh:44
#define mmCP_HQD_PQ_WPTR_POLL_ADDR_HI
Definition pm4_mmio.hh:62
#define mmCP_RB0_WPTR
Definition pm4_mmio.hh:46
#define mmCP_HQD_PQ_WPTR_LO
Definition pm4_mmio.hh:65
#define UNSERIALIZE_SCALAR(scalar)
Definition serialize.hh:575
#define SERIALIZE_SCALAR(scalar)
Definition serialize.hh:568
uint32_t sdmax_rlcx_ib_base_lo
uint32_t sdmax_rlcx_rb_rptr
uint32_t doorbellOffset0
uint32_t sdmax_rlcx_rb_rptr_addr_hi
uint32_t sdmax_rlcx_rb_cntl
uint32_t doorbellOffset3
uint32_t sdmax_rlcx_rb_wptr_hi
uint32_t doorbellOffset2
uint32_t sdmax_rlcx_ib_base_hi
uint32_t doorbellOffset
uint32_t processQuantum
uint32_t hqd_pq_control
uint32_t hqd_active
Definition pm4_queues.hh:96
uint32_t sdmax_rlcx_rb_rptr_addr_lo
uint32_t sdmax_rlcx_rb_wptr
uint32_t sdmax_rlcx_rb_rptr_hi
uint64_t mqdReadIndex
Definition pm4_queues.hh:55
uint32_t doorbellOffset1
uint64_t completionSignal

Generated on Mon May 26 2025 09:19:09 for gem5 by doxygen 1.13.2