37#include "debug/GPUFetch.hh"
38#include "debug/GPUPort.hh"
39#include "debug/GPUTLB.hh"
53 : timingSim(true), computeUnit(cu), fetchScheduler(
p),
54 waveList(nullptr), fetchDepth(
p.fetch_depth)
95 if (!fetch_buf.hasFreeSpace()) {
96 fetch_buf.checkWaveReleaseBuf();
98 if (fetch_buf.hasFetchDataToProcess()) {
99 fetch_buf.decodeInsts();
157 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: Id%d: Initiate fetch "
161 DPRINTF(GPUTLB,
"CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
184 DPRINTF(GPUTLB,
"Failed to send TLB req for FETCH addr %#x\n",
195 DPRINTF(GPUTLB,
"Failed to send TLB req for FETCH addr %#x\n",
200 DPRINTF(GPUTLB,
"sent FETCH translation request for %#x\n",
vaddr);
214 if (!pkt->
req->systemReq()) {
225 fetch(pkt, wavefront);
232 assert(pkt->
req->hasPaddr());
233 assert(pkt->
req->hasSize());
235 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: Fetch Access: %#x\n",
237 pkt->
req->getPaddr());
266 if (!pkt->
req->systemReq()) {
276 .reservedBuf(pkt->
req->getVaddr()));
284 if (pkt->
req->systemReq()) {
292 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
294 pkt->
req->getPaddr());
296 DPRINTF(GPUPort,
"CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
298 pkt->
req->getPaddr());
310 safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->
senderState);
314 DPRINTF(GPUFetch,
"CU%d: WF[%d][%d]: Fetch addr %#x returned "
357 "Cache line size should be a power of two.");
372 restartFromBranch =
true;
383 freeList.push_back(bufStart +
i * cacheLineSize);
386 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
387 "buffer\n", wavefront->simdId, wavefront->wfSlotId,
396 if (bufferedAndReservedLines()) {
397 Addr last_line_fetched = 0;
398 if (!reservedLines()) {
403 last_line_fetched = bufferedPCs.rbegin()->first;
405 last_line_fetched = reservedPCs.rbegin()->first;
408 next_line = last_line_fetched + cacheLineSize;
414 assert(bufferedPCs.find(next_line) == bufferedPCs.end());
415 assert(reservedPCs.find(next_line) == reservedPCs.end());
434 if (restartFromBranch) {
435 restartFromBranch =
false;
439 readPtr += byte_offset;
451 assert(hasFreeSpace());
452 assert(bufferedPCs.find(
vaddr) == bufferedPCs.end());
453 assert(reservedPCs.find(
vaddr) == reservedPCs.end());
454 assert(bufferedAndReservedLines() <
fetchDepth);
456 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d reserved fetch buffer entry "
457 "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
458 wavefront->wfDynId,
vaddr);
466 uint8_t *inst_buf = freeList.front();
467 reservedPCs.emplace(
vaddr, inst_buf);
468 freeList.pop_front();
474 assert(bufferedPCs.find(
vaddr) == bufferedPCs.end());
475 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d done fetching for addr %#x\n",
476 wavefront->simdId, wavefront->wfSlotId,
477 wavefront->wfDynId,
vaddr);
484 auto reserved_pc = reservedPCs.find(
vaddr);
485 assert(reserved_pc != reservedPCs.end());
486 bufferedPCs.emplace(
vaddr, reserved_pc->second);
488 if (readPtr == bufEnd) {
492 reserved_pc->second =
nullptr;
493 reservedPCs.erase(reserved_pc);
499 return fetchBytesRemaining() >=
sizeof(TheGpuISA::RawMachInst);
506 wavefront->computeUnit->cacheLineSize());
507 if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
508 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d current wave PC(%#x) still "
509 "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
510 wavefront->wfDynId, cur_wave_pc);
513 assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());
518 auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
519 auto oldest_buffered_pc = bufferedPCs.begin();
521 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d checking if PC block addr = %#x"
522 "(PC = %#x) can be released.\n", wavefront->simdId,
523 wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
528 for (
const auto &buf_pc : bufferedPCs) {
529 DPRINTF(GPUFetch,
"PC[%d] = %#x\n", idx, buf_pc.first);
536 assert(current_buffered_pc != bufferedPCs.end());
544 if (current_buffered_pc != oldest_buffered_pc) {
545 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d done fetching for PC = %#x, "
546 "removing it from the fetch buffer.\n", wavefront->simdId,
547 wavefront->wfSlotId, wavefront->wfDynId,
548 oldest_buffered_pc->first);
550 freeList.emplace_back(oldest_buffered_pc->second);
551 oldest_buffered_pc->second =
nullptr;
552 bufferedPCs.erase(oldest_buffered_pc);
553 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d has %d lines buffered.\n",
554 wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
568 while (wavefront->instructionBuffer.size() < maxIbSize
569 && hasFetchDataToProcess()) {
573 TheGpuISA::MachInst mach_inst
574 =
reinterpret_cast<TheGpuISA::MachInst
>(readPtr);
575 GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
576 readPtr += gpu_static_inst->
instSize();
578 assert(readPtr <= bufEnd);
581 = std::make_shared<GPUDynInst>(wavefront->computeUnit,
582 wavefront, gpu_static_inst,
583 wavefront->computeUnit->
585 wavefront->instructionBuffer.push_back(gpu_dyn_inst);
587 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%ld decoded %s (%d bytes). "
588 "%d bytes remain.\n", wavefront->simdId,
589 wavefront->wfSlotId, wavefront->wfDynId,
592 fetchBytesRemaining());
600 TheGpuISA::RawMachInst split_inst = 0;
601 int dword_size =
sizeof(uint32_t);
602 int num_dwords =
sizeof(TheGpuISA::RawMachInst) / dword_size;
604 for (
int i = 0;
i < num_dwords; ++
i) {
606 *
reinterpret_cast<uint32_t*
>(readPtr));
607 if (readPtr + dword_size >= bufEnd) {
612 assert(readPtr == bufStart);
614 TheGpuISA::MachInst mach_inst
615 =
reinterpret_cast<TheGpuISA::MachInst
>(&split_inst);
616 GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
617 readPtr += (gpu_static_inst->
instSize() - dword_size);
618 assert(readPtr < bufEnd);
621 = std::make_shared<GPUDynInst>(wavefront->computeUnit,
622 wavefront, gpu_static_inst,
623 wavefront->computeUnit->
625 wavefront->instructionBuffer.push_back(gpu_dyn_inst);
627 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d decoded split inst %s (%#x) "
628 "(%d bytes). %d bytes remain in %d buffered lines.\n",
629 wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
631 gpu_static_inst->
instSize(), fetchBytesRemaining(),
642 bool is_split = (readPtr +
sizeof(TheGpuISA::RawMachInst)) > bufEnd;
650 int bytes_remaining = 0;
652 if (bufferedLines() && readPtr != bufEnd) {
653 auto last_buf_pc = bufferedPCs.rbegin();
654 uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
655 int byte_diff = end_ptr - readPtr;
657 if (end_ptr > readPtr) {
658 bytes_remaining = byte_diff;
659 }
else if (end_ptr < readPtr) {
660 bytes_remaining = bufferedBytes() + byte_diff;
664 assert(bytes_remaining <= bufferedBytes());
665 return bytes_remaining;
671 reqPkt->makeResponse();
672 fetchUnit->computeUnit.handleSQCReturn(reqPkt);
void sendRequest(PacketPtr pkt, Event *callback)
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
std::deque< std::pair< PacketPtr, Wavefront * > > retries
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
int getCacheLineBits() const
int cacheLineSize() const
RequestorID requestorId()
void fetchDone(Addr vaddr)
void reserveBuf(Addr vaddr)
reserve an entry in the fetch buffer for PC = vaddr.
uint8_t * readPtr
pointer that points to the next chunk of inst data to be decoded.
int fetchBytesRemaining() const
calculates the number of fetched bytes that have yet to be decoded.
void checkWaveReleaseBuf()
checks if the wavefront can release any of its fetch buffer entries.
bool hasFetchDataToProcess() const
checks if the buffer contains valid data.
void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf)
allocate the fetch buffer space, and set the fetch depth (number of lines that may be buffered),...
bool splitDecode() const
check if the next instruction to be processed out of the fetch buffer is split across the end/beginni...
void decodeInsts()
each time the fetch stage is ticked, we check if there are any data in the fetch buffer that may be d...
uint8_t * bufStart
raw instruction buffer.
std::deque< uint8_t * > freeList
represents the fetch buffer free list.
static uint32_t globalFetchUnitID
std::vector< Wavefront * > * waveList
void bindWaveList(std::vector< Wavefront * > *list)
FetchUnit(const ComputeUnitParams &p, ComputeUnit &cu)
void fetch(PacketPtr pkt, Wavefront *wavefront)
std::vector< Wavefront * > fetchQueue
void initiateFetch(Wavefront *wavefront)
int fetchDepth
number of cache lines we can fetch and buffer.
TheGpuISA::Decoder decoder
ComputeUnit & computeUnit
void processFetchReturn(PacketPtr pkt)
std::vector< FetchBufDesc > fetchBuf
void flushBuf(int wfSlotId)
std::vector< std::pair< Wavefront *, bool > > fetchStatusQueue
const std::string & disassemble()
virtual int instSize() const =0
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
SenderState * senderState
This packet's sender state.
RequestPtr req
A pointer to the original request.
MemCmd cmd
The command field of the packet.
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
void sendFunctional(PacketPtr pkt) const
Send a functional request packet, where the data is instantly updated everywhere in the memory system...
@ INST_FETCH
The request was an instruction fetch.
void bindList(std::vector< Wavefront * > *sched_list)
AMDGPUSystemHub * systemHub
std::deque< GPUDynInstPtr > instructionBuffer
@ S_WAITCNT
wavefront has unsatisfied wait counts
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
static constexpr bool isPowerOf2(const T &n)
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
constexpr void replaceBits(T &val, unsigned first, unsigned last, B bit_val)
A convenience function to replace bits first to last of val with bit_val in place.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
Addr makeLineAddress(Addr addr)
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
uint64_t Addr
Address type This will probably be moved somewhere else in the near future.
SenderState is information carried along with the packet throughout the TLB hierarchy.
GPU TranslationState: this is currently something of a bastardization of the usage of SenderState,...