#include "debug/GPUFetch.hh"
#include "debug/GPUPort.hh"
#include "debug/GPUTLB.hh"
    if (!fetch_buf.hasFreeSpace()) {
        fetch_buf.checkWaveReleaseBuf();
    }

    if (fetch_buf.hasFetchDataToProcess()) {
        fetch_buf.decodeInsts();
    }

    // ...

    for (int j = 0; j < computeUnit.shader->n_wf; ++j) {
    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Id%d: Initiate fetch "
            /* ... */);

    DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
            /* ... */);

    // ...

        DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                /* ... */);
    } else if (!computeUnit.sqcTLBPort.sendTimingReq(pkt)) {
        // ...
        DPRINTF(GPUTLB, "Failed to send TLB req for FETCH addr %#x\n",
                /* ... */);
    // ...
        DPRINTF(GPUTLB, "sent FETCH translation request for %#x\n", vaddr);

    // ...

    if (!pkt->req->systemReq()) {
        // ...
    }

    // ...

    fetch(pkt, wavefront);
    assert(pkt->req->hasPaddr());
    assert(pkt->req->hasSize());

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch Access: %#x\n",
            /* ... */ pkt->req->getPaddr());

    // ...

    if (!pkt->req->systemReq()) {
        // ...
    }

    // ...
           .reservedBuf(pkt->req->getVaddr()));

    // ...

    if (pkt->req->systemReq()) {
        // ...
        computeUnit.shader->systemHub->sendRequest(pkt, resp_event);
    } else if (!computeUnit.sqcPort.sendTimingReq(pkt)) {
        computeUnit.sqcPort.retries.push_back(std::make_pair(pkt,
            /* ... */));

        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
                /* ... */ pkt->req->getPaddr());
    } else {
        DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
                /* ... */ pkt->req->getPaddr());
    }

    // ...

    DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: Fetch addr %#x returned "
            /* ... */);
358 "Cache line size should be a power of two.");
387 DPRINTF(GPUFetch,
"WF[%d][%d]: Id%d Fetch dropped, flushing fetch "
398 Addr last_line_fetched = 0;
    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d reserved fetch buffer entry "
            /* ... */);

    // ...

    uint8_t *inst_buf = freeList.front();

    // ...

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for addr %#x\n",
            /* ... */);

    // ...

    reserved_pc->second = nullptr;

    // ...
                              wavefront->computeUnit->cacheLineSize());

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d current wave PC(%#x) still "
            /* ... */);

    // ...

    auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);

    // ...

    DPRINTF(GPUFetch, "WF[%d][%d]: Id%d checking if PC block addr = %#x"
            "(PC = %#x) can be released.\n", wavefront->simdId,
            /* ... */);

    // ...

    DPRINTF(GPUFetch, "PC[%d] = %#x\n", idx, buf_pc.first);

    // ...

    if (current_buffered_pc != oldest_buffered_pc) {
        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d done fetching for PC = %#x, "
                "removing it from the fetch buffer.\n", wavefront->simdId,
                /* ... */ oldest_buffered_pc->first);

        freeList.emplace_back(oldest_buffered_pc->second);
        oldest_buffered_pc->second = nullptr;

        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d has %d lines buffered.\n",
                /* ... */);
        TheGpuISA::MachInst mach_inst
            = reinterpret_cast<TheGpuISA::MachInst>(readPtr);

        // ...

        GPUDynInstPtr gpu_dyn_inst
            = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                           /* ... */);

        // ...

        wavefront->instructionBuffer.push_back(gpu_dyn_inst);

        DPRINTF(GPUFetch, "WF[%d][%d]: Id%ld decoded %s (%d bytes). "
                /* ... */);
        TheGpuISA::RawMachInst split_inst = 0;
        int dword_size = sizeof(uint32_t);
        int num_dwords = sizeof(TheGpuISA::RawMachInst) / dword_size;

        for (int i = 0; i < num_dwords; ++i) {
            replaceBits(split_inst, /* ... */,
                        *reinterpret_cast<uint32_t*>(readPtr));
            // ...
        }

        // ...

        TheGpuISA::MachInst mach_inst
            = reinterpret_cast<TheGpuISA::MachInst>(&split_inst);

        // ...

        GPUDynInstPtr gpu_dyn_inst
            = std::make_shared<GPUDynInst>(wavefront->computeUnit,
                                           /* ... */);

        // ...

        wavefront->instructionBuffer.push_back(gpu_dyn_inst);

        DPRINTF(GPUFetch, "WF[%d][%d]: Id%d decoded split inst %s (%#x) "
                "(%d bytes). %d bytes remain in %d buffered lines.\n",
                /* ... */);
    bool is_split = (readPtr + sizeof(TheGpuISA::RawMachInst)) > bufEnd;
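The split-decode path above stitches a 64-bit raw instruction back together one 32-bit dword at a time when it wraps past the end of the circular fetch buffer. A minimal standalone sketch of that bit-stitching idea, using plain shifts instead of gem5's replaceBits() and hypothetical names (reassembleSplitInst, buf, buf_size, read_off), might look like:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Sketch: reassemble a 64-bit "raw instruction" whose two 32-bit halves
// wrap around the end of a circular byte buffer. Names are illustrative,
// not gem5's.
uint64_t
reassembleSplitInst(const uint8_t *buf, size_t buf_size, size_t read_off)
{
    uint64_t raw_inst = 0;
    const int num_dwords = sizeof(uint64_t) / sizeof(uint32_t);

    for (int i = 0; i < num_dwords; ++i) {
        uint32_t dword;
        std::memcpy(&dword, buf + read_off, sizeof(dword));
        // Place dword i into bits [32*i + 31 : 32*i] of the raw instruction,
        // analogous to replaceBits(split_inst, 32*(i+1)-1, 32*i, dword).
        raw_inst |= static_cast<uint64_t>(dword) << (32 * i);
        // Advance the read offset, wrapping at the end of the buffer.
        read_off = (read_off + sizeof(dword)) % buf_size;
    }
    return raw_inst;
}

int main()
{
    // 8-byte buffer; the instruction starts 4 bytes before the end, so its
    // second dword wraps back to offset 0.
    uint8_t buf[8] = {0xdd, 0xcc, 0xbb, 0xaa,   // wrapped high dword
                      0x44, 0x33, 0x22, 0x11};  // low dword at offset 4
    std::printf("%#llx\n",
                (unsigned long long)reassembleSplitInst(buf, sizeof(buf), 4));
    // Prints 0xaabbccdd11223344 on a little-endian host.
    return 0;
}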
    int bytes_remaining = 0;

    // ...

        int byte_diff = end_ptr - readPtr;

        // ...
            bytes_remaining = byte_diff;
        } else if (end_ptr < readPtr) {
            // ...
        }

    // ...

    return bytes_remaining;
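The fragment above computes how many fetched bytes are still undecoded when the valid region may wrap around the end of the buffer. A self-contained sketch of the same circular-buffer arithmetic, with hypothetical names (bytesRemaining, buf_size, end_off, read_off) rather than the gem5 implementation:

#include <cassert>
#include <cstddef>

// Sketch: bytes still to be consumed in a circular buffer, given the offset
// one past the last valid byte (end_off) and the current read offset
// (read_off). Equal offsets are treated as an empty buffer here.
size_t
bytesRemaining(size_t buf_size, size_t end_off, size_t read_off)
{
    assert(end_off < buf_size && read_off < buf_size);
    if (end_off >= read_off) {
        return end_off - read_off;           // valid data is contiguous
    }
    return buf_size - read_off + end_off;    // valid data wraps around
}

int main()
{
    assert(bytesRemaining(64, 48, 16) == 32);  // contiguous case
    assert(bytesRemaining(64, 8, 56) == 16);   // wrapped case
    return 0;
}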
Referenced FetchUnit::FetchBufDesc members (fetch buffer bookkeeping):

int reservedLines() const
std::map<Addr, uint8_t *> reservedPCs
void reserveBuf(Addr vaddr)
    Reserve an entry in the fetch buffer for PC = vaddr, ...
uint8_t *readPtr
    Pointer to the next chunk of instruction data to be decoded.
int fetchBytesRemaining() const
    Calculates the number of fetched bytes that have yet to be decoded.
void checkWaveReleaseBuf()
    Checks if the wavefront can release any of its fetch buffer entries.
bool hasFetchDataToProcess() const
    Checks if the buffer contains valid data.
std::map<Addr, uint8_t *> bufferedPCs
    The set of PCs (fetch addresses) that are currently buffered.
int bufferedAndReservedLines() const
void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf)
    Allocate the fetch buffer space and set the fetch depth (number of lines
    that may be buffered), ...
int bufferedLines() const
bool hasFreeSpace() const
void fetchDone(PacketPtr ptr)
bool splitDecode() const
    Check if the next instruction to be processed out of the fetch buffer is
    split across the end/beginning of the buffer.
void decodeInsts()
    Each time the fetch stage is ticked, we check if there are any data in the
    fetch buffer that may be decoded.
uint8_t *bufStart
    Raw instruction buffer.
TheGpuISA::Decoder *_decoder
int bufferedBytes() const
std::deque<uint8_t *> freeList
    Represents the fetch buffer free list.
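Taken together, these members describe a small pool of cache-line-sized slots cycled through a free list: a slot is reserved for a PC before the request is sent, filled when the fetch returns, decoded, and eventually released back to the free list. A simplified, self-contained sketch of that lifecycle, with a hypothetical SimpleFetchBuf class rather than gem5's FetchBufDesc, could look like:

#include <cassert>
#include <cstdint>
#include <deque>
#include <map>
#include <vector>

// Simplified fetch-buffer bookkeeping: a fixed number of line-sized slots,
// a free list of unused slots, and two maps tracking reserved (in flight)
// and buffered (data returned) line addresses. Illustrative only.
class SimpleFetchBuf
{
  public:
    SimpleFetchBuf(int depth, int line_size)
        : lineSize(line_size), storage(depth * line_size)
    {
        for (int i = 0; i < depth; ++i)
            freeList.push_back(&storage[i * line_size]);
    }

    bool hasFreeSpace() const { return !freeList.empty(); }

    // Reserve a slot for the line at vaddr before the memory request is sent.
    void reserveBuf(uint64_t vaddr)
    {
        assert(hasFreeSpace());
        reservedPCs[vaddr] = freeList.front();
        freeList.pop_front();
    }

    // Called when fetched data for vaddr returns: the slot becomes buffered.
    void fetchDone(uint64_t vaddr)
    {
        auto it = reservedPCs.find(vaddr);
        assert(it != reservedPCs.end());
        bufferedPCs[vaddr] = it->second;
        reservedPCs.erase(it);
    }

    // Release the slot of a fully decoded line back to the free list.
    void releaseBuf(uint64_t vaddr)
    {
        auto it = bufferedPCs.find(vaddr);
        assert(it != bufferedPCs.end());
        freeList.push_back(it->second);
        bufferedPCs.erase(it);
    }

  private:
    int lineSize;
    std::vector<uint8_t> storage;               // raw instruction bytes
    std::deque<uint8_t *> freeList;             // unused slots
    std::map<uint64_t, uint8_t *> reservedPCs;  // requests in flight
    std::map<uint64_t, uint8_t *> bufferedPCs;  // lines with valid data
};

The real FetchBufDesc additionally tracks a decode read pointer within the buffered lines and, via checkWaveReleaseBuf(), only releases the oldest buffered line once the wavefront's PC has moved past it.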
Referenced FetchUnit members:

static uint32_t globalFetchUnitID
std::vector<Wavefront *> *waveList
void bindWaveList(std::vector<Wavefront *> *list)
FetchUnit(const ComputeUnitParams &p, ComputeUnit &cu)
void fetch(PacketPtr pkt, Wavefront *wavefront)
std::vector<Wavefront *> fetchQueue
void initiateFetch(Wavefront *wavefront)
int fetchDepth
    Number of cache lines we can fetch and buffer.
TheGpuISA::Decoder decoder
ComputeUnit &computeUnit
void processFetchReturn(PacketPtr pkt)
std::vector<FetchBufDesc> fetchBuf
void flushBuf(int wfSlotId)
std::vector<std::pair<Wavefront *, bool>> fetchStatusQueue
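The fetchQueue/fetchStatusQueue pair suggests a simple per-wavefront arbitration scheme: each wavefront carries a "fetch pending" flag, and each tick the unit initiates a fetch for eligible wavefronts. A rough standalone sketch of that idea, not gem5's actual exec() logic, with stand-in names (SimpleFetchStage, ibWantsMore, initiateFetch):

#include <utility>
#include <vector>

// Illustrative only: per-wavefront fetch bookkeeping in the spirit of
// fetchStatusQueue. Wavefront here is a stand-in struct, not gem5's class.
struct Wavefront { int simdId; int wfSlotId; };

class SimpleFetchStage
{
  public:
    explicit SimpleFetchStage(const std::vector<Wavefront *> &waves)
    {
        for (auto *w : waves)
            fetchStatusQueue.emplace_back(w, false);  // no fetch in flight
    }

    // Called every cycle: start a fetch for each wavefront whose instruction
    // buffer wants more data and which has no fetch outstanding.
    void exec()
    {
        for (auto &[wave, pending] : fetchStatusQueue) {
            if (!pending && ibWantsMore(wave)) {
                initiateFetch(wave);
                pending = true;
            }
        }
    }

    // Called when the fetch for 'wave' has returned and been buffered.
    void fetchReturned(Wavefront *wave)
    {
        for (auto &[w, pending] : fetchStatusQueue)
            if (w == wave)
                pending = false;
    }

  private:
    bool ibWantsMore(Wavefront *) const { return true; }  // placeholder
    void initiateFetch(Wavefront *) { /* issue translation + fetch */ }

    std::vector<std::pair<Wavefront *, bool>> fetchStatusQueue;
};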
Other referenced symbols:

const std::string &disassemble()
virtual int instSize() const = 0
Packet
    A Packet is used to encapsulate a transfer between two objects in the
    memory system (e.g., ...).
void dataStatic(T *p)
    Set the data pointer to the following value that should not be freed.
SenderState *senderState
    This packet's sender state.
RequestPtr req
    A pointer to the original request.
MemCmd cmd
    The command field of the packet.
INST_FETCH
    The request was an instruction fetch.
std::deque<GPUDynInstPtr> instructionBuffer
S_WAITCNT
    Wavefront has unsatisfied wait counts.
static constexpr std::enable_if_t<std::is_integral_v<T>, int> floorLog2(T x)
static constexpr bool isPowerOf2(const T &n)
static constexpr T roundDown(const T &val, const U &align)
    This function is used to align addresses in memory.
constexpr void replaceBits(T &val, unsigned first, unsigned last, B bit_val)
    A convenience function to replace bits first to last of val with bit_val
    in place.
#define panic_if(cond, ...)
    Conditional panic macro that checks the supplied condition and only panics
    if the condition is true and allows the programmer to specify diagnostic
    printout.
Addr makeLineAddress(Addr addr, int cacheLineBits)
T safe_cast(U &&ref_or_ptr)
std::shared_ptr<Request> RequestPtr
std::shared_ptr<GPUDynInst> GPUDynInstPtr
uint64_t Addr
    Address type. This will probably be moved somewhere else in the near
    future.
SenderState
    SenderState is information carried along with the packet throughout the
    TLB hierarchy.
GPU TranslationState
    This currently is a somewhat bastardization of the usage of SenderState, ...
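Several of the utilities above (isPowerOf2, floorLog2, roundDown, makeLineAddress) exist because the fetch unit aligns a wavefront's PC down to the start of its cache line, which only works as a simple mask when the line size is a power of two (hence the "Cache line size should be a power of two." panic quoted earlier). A small standalone illustration of that arithmetic, using plain C++ helpers (isPow2, lineAddress) in the spirit of the gem5 functions rather than the functions themselves:

#include <cassert>
#include <cstdint>

// Align addr down to the start of its cache line. line_size must be a power
// of two so that masking off the low bits is equivalent to rounding down.
constexpr bool isPow2(uint64_t n) { return n && !(n & (n - 1)); }

constexpr uint64_t lineAddress(uint64_t addr, uint64_t line_size)
{
    return addr & ~(line_size - 1);
}

int main()
{
    constexpr uint64_t line_size = 64;  // 64-byte cache line
    static_assert(isPow2(line_size),
                  "Cache line size should be a power of two.");
    static_assert(lineAddress(0x1234, line_size) == 0x1200,
                  "0x1234 lies in the line starting at 0x1200");
    assert(lineAddress(0x12ff, line_size) == 0x12c0);
    return 0;
}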