37#include "debug/GPUFetch.hh"
38#include "debug/GPUPort.hh"
39#include "debug/GPUTLB.hh"
53 : timingSim(true), computeUnit(cu), fetchScheduler(
p),
54 waveList(nullptr), fetchDepth(
p.fetch_depth)
95 if (!fetch_buf.hasFreeSpace()) {
96 fetch_buf.checkWaveReleaseBuf();
98 if (fetch_buf.hasFetchDataToProcess()) {
99 fetch_buf.decodeInsts();
157 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: Id%d: Initiate fetch "
161 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
184 DPRINTF(GPUTLB,
"Failed to send TLB req for FETCH addr %#x\n",
195 DPRINTF(GPUTLB,
"Failed to send TLB req for FETCH addr %#x\n",
200 DPRINTF(GPUTLB,
"sent FETCH translation request for %#x\n",
vaddr);
214 if (!pkt->
req->systemReq()) {
225 fetch(pkt, wavefront);
232 assert(pkt->
req->hasPaddr());
233 assert(pkt->
req->hasSize());
235 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: Fetch Access: %#x\n",
237 pkt->
req->getPaddr());
266 if (!pkt->
req->systemReq()) {
276 .reservedBuf(pkt->
req->getVaddr()));
284 if (pkt->
req->systemReq()) {
292 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
294 pkt->
req->getPaddr());
296 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
298 pkt->
req->getPaddr());
314 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: Fetch addr %#x returned "
357 "Cache line size should be a power of two.");
372 restartFromBranch =
true;
383 freeList.push_back(bufStart +
i * cacheLineSize);
386 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
387 "buffer\n", wavefront->simdId, wavefront->wfSlotId,
396 if (bufferedAndReservedLines()) {
397 Addr last_line_fetched = 0;
398 if (!reservedLines()) {
403 last_line_fetched = bufferedPCs.rbegin()->first;
405 last_line_fetched = reservedPCs.rbegin()->first;
408 next_line = last_line_fetched + cacheLineSize;
414 assert(bufferedPCs.find(next_line) == bufferedPCs.end());
415 assert(reservedPCs.find(next_line) == reservedPCs.end());
434 if (restartFromBranch) {
435 restartFromBranch =
false;
439 readPtr += byte_offset;
451 assert(hasFreeSpace());
452 assert(bufferedPCs.find(
vaddr) == bufferedPCs.end());
453 assert(reservedPCs.find(
vaddr) == reservedPCs.end());
454 assert(bufferedAndReservedLines() <
fetchDepth);
456 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d reserved fetch buffer entry "
457 "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
458 wavefront->wfDynId,
vaddr);
466 uint8_t *inst_buf = freeList.front();
467 reservedPCs.emplace(
vaddr, inst_buf);
468 freeList.pop_front();
481 wavefront->decLGKMInstsIssued();
483 restartFromBranch =
false;
489 assert(bufferedPCs.find(
vaddr) == bufferedPCs.end());
490 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d done fetching for addr %#x\n",
491 wavefront->simdId, wavefront->wfSlotId,
492 wavefront->wfDynId,
vaddr);
499 auto reserved_pc = reservedPCs.find(
vaddr);
500 assert(reserved_pc != reservedPCs.end());
501 bufferedPCs.emplace(
vaddr, reserved_pc->second);
503 if (readPtr == bufEnd) {
507 reserved_pc->second =
nullptr;
508 reservedPCs.erase(reserved_pc);
514 return fetchBytesRemaining() >=
sizeof(TheGpuISA::RawMachInst);
521 wavefront->computeUnit->cacheLineSize());
522 if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
523 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d current wave PC(%#x) still "
524 "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
525 wavefront->wfDynId, cur_wave_pc);
528 assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());
533 auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
534 auto oldest_buffered_pc = bufferedPCs.begin();
536 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d checking if PC block addr = %#x"
537 "(PC = %#x) can be released.\n", wavefront->simdId,
538 wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
543 for (
const auto &buf_pc : bufferedPCs) {
544 DPRINTF(GPUFetch,
"PC[%d] = %#x\n", idx, buf_pc.first);
551 assert(current_buffered_pc != bufferedPCs.end());
559 if (current_buffered_pc != oldest_buffered_pc) {
560 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d done fetching for PC = %#x, "
561 "removing it from the fetch buffer.\n", wavefront->simdId,
562 wavefront->wfSlotId, wavefront->wfDynId,
563 oldest_buffered_pc->first);
565 freeList.emplace_back(oldest_buffered_pc->second);
566 oldest_buffered_pc->second =
nullptr;
567 bufferedPCs.erase(oldest_buffered_pc);
568 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d has %d lines buffered.\n",
569 wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
583 while (wavefront->instructionBuffer.size() < maxIbSize
584 && hasFetchDataToProcess()) {
588 TheGpuISA::MachInst mach_inst
589 =
reinterpret_cast<TheGpuISA::MachInst
>(readPtr);
590 GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
591 readPtr += gpu_static_inst->
instSize();
593 assert(readPtr <= bufEnd);
596 = std::make_shared<GPUDynInst>(wavefront->computeUnit,
597 wavefront, gpu_static_inst,
598 wavefront->computeUnit->
600 wavefront->instructionBuffer.push_back(gpu_dyn_inst);
602 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%ld decoded %s (%d bytes). "
603 "%d bytes remain.\n", wavefront->simdId,
604 wavefront->wfSlotId, wavefront->wfDynId,
607 fetchBytesRemaining());
615 TheGpuISA::RawMachInst split_inst = 0;
616 int dword_size =
sizeof(uint32_t);
617 int num_dwords =
sizeof(TheGpuISA::RawMachInst) / dword_size;
619 for (
int i = 0;
i < num_dwords; ++
i) {
621 *
reinterpret_cast<uint32_t*
>(readPtr));
622 if (readPtr + dword_size >= bufEnd) {
627 assert(readPtr == bufStart);
629 TheGpuISA::MachInst mach_inst
630 =
reinterpret_cast<TheGpuISA::MachInst
>(&split_inst);
631 GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
632 readPtr += (gpu_static_inst->
instSize() - dword_size);
633 assert(readPtr < bufEnd);
636 = std::make_shared<GPUDynInst>(wavefront->computeUnit,
637 wavefront, gpu_static_inst,
638 wavefront->computeUnit->
640 wavefront->instructionBuffer.push_back(gpu_dyn_inst);
642 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d decoded split inst %s (%#x) "
643 "(%d bytes). %d bytes remain in %d buffered lines.\n",
644 wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
646 gpu_static_inst->
instSize(), fetchBytesRemaining(),
657 bool is_split = (readPtr +
sizeof(TheGpuISA::RawMachInst)) > bufEnd;
665 int bytes_remaining = 0;
667 if (bufferedLines() && readPtr != bufEnd) {
668 auto last_buf_pc = bufferedPCs.rbegin();
669 uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
670 int byte_diff = end_ptr - readPtr;
672 if (end_ptr > readPtr) {
673 bytes_remaining = byte_diff;
674 }
else if (end_ptr < readPtr) {
675 bytes_remaining = bufferedBytes() + byte_diff;
679 assert(bytes_remaining <= bufferedBytes());
680 return bytes_remaining;
686 reqPkt->makeResponse();
687 fetchUnit->computeUnit.handleSQCReturn(reqPkt);
void sendRequest(PacketPtr pkt, Event *callback)
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
std::deque< std::pair< PacketPtr, Wavefront * > > retries
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
int getCacheLineBits() const
int cacheLineSize() const
RequestorID requestorId()
void reserveBuf(Addr vaddr)
reserve an entry in the fetch buffer for PC = vaddr.
uint8_t * readPtr
pointer that points to the next chunk of inst data to be decoded.
int fetchBytesRemaining() const
calculates the number of fetched bytes that have yet to be decoded.
void checkWaveReleaseBuf()
checks if the wavefront can release any of its fetch buffer entries.
bool hasFetchDataToProcess() const
checks if the buffer contains valid data.
void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf)
allocate the fetch buffer space, and set the fetch depth (number of lines that may be buffered), the fetch size (cache line size), and the parent wavefront of this fetch buffer.
void fetchDone(PacketPtr ptr)
bool splitDecode() const
check if the next instruction to be processed out of the fetch buffer is split across the end/beginning of the fetch buffer.
void decodeInsts()
each time the fetch stage is ticked, we check if there are any data in the fetch buffer that may be decoded and, if so, decode them into the wavefront's instruction buffer.
uint8_t * bufStart
raw instruction buffer.
std::deque< uint8_t * > freeList
represents the fetch buffer free list.
static uint32_t globalFetchUnitID
std::vector< Wavefront * > * waveList
void bindWaveList(std::vector< Wavefront * > *list)
FetchUnit(const ComputeUnitParams &p, ComputeUnit &cu)
void fetch(PacketPtr pkt, Wavefront *wavefront)
std::vector< Wavefront * > fetchQueue
void initiateFetch(Wavefront *wavefront)
int fetchDepth
number of cache lines we can fetch and buffer.
TheGpuISA::Decoder decoder
ComputeUnit & computeUnit
void processFetchReturn(PacketPtr pkt)
std::vector< FetchBufDesc > fetchBuf
void flushBuf(int wfSlotId)
std::vector< std::pair< Wavefront *, bool > > fetchStatusQueue
const std::string & disassemble()
virtual int instSize() const =0
A Packet is used to encapsulate a transfer between two objects in the memory system (e.g., the L1 and L2 cache).
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
SenderState * senderState
This packet's sender state.
RequestPtr req
A pointer to the original request.
MemCmd cmd
The command field of the packet.
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
void sendFunctional(PacketPtr pkt) const
Send a functional request packet, where the data is instantly updated everywhere in the memory system, without affecting the current state of any block or moving the block.
@ INST_FETCH
The request was an instruction fetch.
void bindList(std::vector< Wavefront * > *sched_list)
AMDGPUSystemHub * systemHub
std::deque< GPUDynInstPtr > instructionBuffer
@ S_WAITCNT
wavefront has unsatisfied wait counts
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
static constexpr bool isPowerOf2(const T &n)
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
constexpr void replaceBits(T &val, unsigned first, unsigned last, B bit_val)
A convenience function to replace bits first to last of val with bit_val in place.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true, allowing the programmer to specify diagnostic printout.
Addr makeLineAddress(Addr addr)
Copyright (c) 2024 - Pranith Kumar Copyright (c) 2020 Inria All rights reserved.
T safe_cast(U &&ref_or_ptr)
std::shared_ptr< Request > RequestPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
SenderState is information carried along with the packet throughout the TLB hierarchy.
GPU TranslationState: this currently is somewhat of a bastardization of the usage of SenderState, used to carry translation state through the GPU TLB hierarchy.