37 #include "debug/GPUFetch.hh" 
   38 #include "debug/GPUPort.hh" 
   39 #include "debug/GPUTLB.hh" 
   53     : timingSim(true), computeUnit(cu), fetchScheduler(
p),
 
   54       waveList(nullptr), fetchDepth(
p.fetch_depth)
 
   95         if (!fetch_buf.hasFreeSpace()) {
 
   96             fetch_buf.checkWaveReleaseBuf();
 
   98         if (fetch_buf.hasFetchDataToProcess()) {
 
   99             fetch_buf.decodeInsts();
 
  157     DPRINTF(GPUFetch, 
"CU%d: WF[%d][%d]: Id%d: Initiate fetch " 
  161     DPRINTF(GPUTLB, 
"CU%d: WF[%d][%d]: Initiating fetch translation: %#x\n",
 
  184             DPRINTF(GPUTLB, 
"Failed to send TLB req for FETCH addr %#x\n",
 
  195             DPRINTF(GPUTLB, 
"Failed to send TLB req for FETCH addr %#x\n",
 
  200             DPRINTF(GPUTLB, 
"sent FETCH translation request for %#x\n", 
vaddr);
 
  214         if (!pkt->
req->systemReq()) {
 
  225         fetch(pkt, wavefront);
 
  232     assert(pkt->
req->hasPaddr());
 
  233     assert(pkt->
req->hasSize());
 
  235     DPRINTF(GPUFetch, 
"CU%d: WF[%d][%d]: Fetch Access: %#x\n",
 
  237             pkt->
req->getPaddr());
 
  266     if (!pkt->
req->systemReq()) {
 
  276                     .reservedBuf(pkt->
req->getVaddr()));
 
  284         if (pkt->
req->systemReq()) {
 
  292             DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: Fetch addr %#x failed!\n",
 
  294                     pkt->
req->getPaddr());
 
  296             DPRINTF(GPUPort, 
"CU%d: WF[%d][%d]: Fetch addr %#x sent!\n",
 
  298                     pkt->
req->getPaddr());
 
  310         safe_cast<ComputeUnit::SQCPort::SenderState*>(pkt->
senderState);
 
  314     DPRINTF(GPUFetch, 
"CU%d: WF[%d][%d]: Fetch addr %#x returned " 
  357         "Cache line size should be a power of two.");
 
  372     restartFromBranch = 
true;
 
  383         freeList.push_back(bufStart + 
i * cacheLineSize);
 
  386     DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%d Fetch dropped, flushing fetch " 
  387             "buffer\n", wavefront->simdId, wavefront->wfSlotId,
 
  396     if (bufferedAndReservedLines()) {
 
  397         Addr last_line_fetched = 0;
 
  398         if (!reservedLines()) {
 
  403             last_line_fetched = bufferedPCs.rbegin()->first;
 
  405             last_line_fetched = reservedPCs.rbegin()->first;
 
  408         next_line = last_line_fetched + cacheLineSize;
 
  414         assert(bufferedPCs.find(next_line) == bufferedPCs.end());
 
  415         assert(reservedPCs.find(next_line) == reservedPCs.end());
 
  434         if (restartFromBranch) {
 
  435             restartFromBranch = 
false;
 
  439             readPtr += byte_offset;
 
  451     assert(hasFreeSpace());
 
  452     assert(bufferedPCs.find(
vaddr) == bufferedPCs.end());
 
  453     assert(reservedPCs.find(
vaddr) == reservedPCs.end());
 
  454     assert(bufferedAndReservedLines() < 
fetchDepth);
 
  456     DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%d reserved fetch buffer entry " 
  457             "for PC = %#x\n", wavefront->simdId, wavefront->wfSlotId,
 
  458             wavefront->wfDynId, 
vaddr);
 
  466     uint8_t *inst_buf = freeList.front();
 
  467     reservedPCs.emplace(
vaddr, inst_buf);
 
  468     freeList.pop_front();
 
  474     assert(bufferedPCs.find(
vaddr) == bufferedPCs.end());
 
  475     DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%d done fetching for addr %#x\n",
 
  476             wavefront->simdId, wavefront->wfSlotId,
 
  477             wavefront->wfDynId, 
vaddr);
 
  484     auto reserved_pc = reservedPCs.find(
vaddr);
 
  485     assert(reserved_pc != reservedPCs.end());
 
  486     bufferedPCs.emplace(
vaddr, reserved_pc->second);
 
  488     if (readPtr == bufEnd) {
 
  492     reserved_pc->second = 
nullptr;
 
  493     reservedPCs.erase(reserved_pc);
 
  506                                  wavefront->computeUnit->cacheLineSize());
 
  507     if (reservedPCs.find(cur_wave_pc) != reservedPCs.end()) {
 
  508         DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%d current wave PC(%#x) still " 
  509                 "being fetched.\n", wavefront->simdId, wavefront->wfSlotId,
 
  510                 wavefront->wfDynId, cur_wave_pc);
 
  513         assert(bufferedPCs.find(cur_wave_pc) == bufferedPCs.end());
 
  518     auto current_buffered_pc = bufferedPCs.find(cur_wave_pc);
 
  519     auto oldest_buffered_pc = bufferedPCs.begin();
 
  521     DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%d checking if PC block addr = %#x" 
  522             "(PC = %#x) can be released.\n", wavefront->simdId,
 
  523             wavefront->wfSlotId, wavefront->wfDynId, cur_wave_pc,
 
  528     for (
const auto &buf_pc : bufferedPCs) {
 
  529         DPRINTF(GPUFetch, 
"PC[%d] = %#x\n", idx, buf_pc.first);
 
  536     assert(current_buffered_pc != bufferedPCs.end());
 
  544     if (current_buffered_pc != oldest_buffered_pc) {
 
  545         DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%d done fetching for PC = %#x, " 
  546                 "removing it from the fetch buffer.\n", wavefront->simdId,
 
  547                 wavefront->wfSlotId, wavefront->wfDynId,
 
  548                 oldest_buffered_pc->first);
 
  550         freeList.emplace_back(oldest_buffered_pc->second);
 
  551         oldest_buffered_pc->second = 
nullptr;
 
  552         bufferedPCs.erase(oldest_buffered_pc);
 
  553         DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%d has %d lines buffered.\n",
 
  554                 wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
 
  568     while (wavefront->instructionBuffer.size() < maxIbSize
 
  569            && hasFetchDataToProcess()) {
 
  575             GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
 
  576             readPtr += gpu_static_inst->
instSize();
 
  578             assert(readPtr <= bufEnd);
 
  581                 = std::make_shared<GPUDynInst>(wavefront->computeUnit,
 
  582                                                wavefront, gpu_static_inst,
 
  583                                                wavefront->computeUnit->
 
  585             wavefront->instructionBuffer.push_back(gpu_dyn_inst);
 
  587             DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%ld decoded %s (%d bytes). " 
  588                     "%d bytes remain.\n", wavefront->simdId,
 
  589                     wavefront->wfSlotId, wavefront->wfDynId,
 
  592                     fetchBytesRemaining());
 
  601     int dword_size = 
sizeof(uint32_t);
 
  604     for (
int i = 0; 
i < num_dwords; ++
i) {
 
  606             *
reinterpret_cast<uint32_t*
>(readPtr));
 
  607         if (readPtr + dword_size >= bufEnd) {
 
  612     assert(readPtr == bufStart);
 
  616     GPUStaticInst *gpu_static_inst = _decoder->decode(mach_inst);
 
  617     readPtr += (gpu_static_inst->
instSize() - dword_size);
 
  618     assert(readPtr < bufEnd);
 
  621         = std::make_shared<GPUDynInst>(wavefront->computeUnit,
 
  622                                        wavefront, gpu_static_inst,
 
  623                                        wavefront->computeUnit->
 
  625     wavefront->instructionBuffer.push_back(gpu_dyn_inst);
 
  627     DPRINTF(GPUFetch, 
"WF[%d][%d]: Id%d decoded split inst %s (%#x) " 
  628             "(%d bytes). %d bytes remain in %d buffered lines.\n",
 
  629             wavefront->simdId, wavefront->wfSlotId, wavefront->wfDynId,
 
  631             gpu_static_inst->
instSize(), fetchBytesRemaining(),
 
  650     int bytes_remaining = 0;
 
  652     if (bufferedLines() && readPtr != bufEnd) {
 
  653         auto last_buf_pc = bufferedPCs.rbegin();
 
  654         uint8_t *end_ptr = last_buf_pc->second + cacheLineSize;
 
  655         int byte_diff = end_ptr - readPtr;
 
  657         if (end_ptr > readPtr) {
 
  658             bytes_remaining = byte_diff;
 
  659         } 
else if (end_ptr < readPtr) {
 
  660             bytes_remaining = bufferedBytes() + byte_diff;
 
  664     assert(bytes_remaining <= bufferedBytes());
 
  665     return bytes_remaining;
 
  671     reqPkt->makeResponse();
 
  672     fetchUnit->computeUnit.handleSQCReturn(reqPkt);
 
void sendRequest(PacketPtr pkt, Event *callback)
std::deque< PacketPtr > retries
here we queue all the translation requests that were not successfully sent.
std::deque< std::pair< PacketPtr, Wavefront * > > retries
RequestorID vramRequestorId()
Forward the VRAM requestor ID needed for device memory from shader.
int getCacheLineBits() const
int cacheLineSize() const
RequestorID requestorId()
void fetchDone(Addr vaddr)
void reserveBuf(Addr vaddr)
reserve an entry in the fetch buffer for PC = vaddr,
uint8_t * readPtr
pointer that points to the next chunk of inst data to be decoded.
int fetchBytesRemaining() const
calculates the number of fetched bytes that have yet to be decoded.
void checkWaveReleaseBuf()
checks if the wavefront can release any of its fetch buffer entries.
bool hasFetchDataToProcess() const
checks if the buffer contains valid data.
void allocateBuf(int fetch_depth, int cache_line_size, Wavefront *wf)
allocate the fetch buffer space, and set the fetch depth (number of lines that may be buffered),...
bool splitDecode() const
check if the next instruction to be processed out of the fetch buffer is split across the end/beginni...
void decodeInsts()
each time the fetch stage is ticked, we check if there are any data in the fetch buffer that may be d...
uint8_t * bufStart
raw instruction buffer.
std::deque< uint8_t * > freeList
represents the fetch buffer free list.
static uint32_t globalFetchUnitID
std::vector< Wavefront * > * waveList
void bindWaveList(std::vector< Wavefront * > *list)
FetchUnit(const ComputeUnitParams &p, ComputeUnit &cu)
void fetch(PacketPtr pkt, Wavefront *wavefront)
std::vector< Wavefront * > fetchQueue
void initiateFetch(Wavefront *wavefront)
int fetchDepth
number of cache lines we can fetch and buffer.
TheGpuISA::Decoder decoder
ComputeUnit & computeUnit
void processFetchReturn(PacketPtr pkt)
std::vector< FetchBufDesc > fetchBuf
void flushBuf(int wfSlotId)
std::vector< std::pair< Wavefront *, bool > > fetchStatusQueue
const std::string & disassemble()
virtual int instSize() const =0
A Packet is used to encapsulate a transfer between two objects in the memory system (e....
void dataStatic(T *p)
Set the data pointer to the following value that should not be freed.
SenderState * senderState
This packet's sender state.
RequestPtr req
A pointer to the original request.
MemCmd cmd
The command field of the packet.
bool sendTimingReq(PacketPtr pkt)
Attempt to send a timing request to the responder port by calling its corresponding receive function.
void sendFunctional(PacketPtr pkt) const
Send a functional request packet, where the data is instantly updated everywhere in the memory system...
@ INST_FETCH
The request was an instruction fetch.
void bindList(std::vector< Wavefront * > *sched_list)
AMDGPUSystemHub * systemHub
std::deque< GPUDynInstPtr > instructionBuffer
@ S_WAITCNT
wavefront has unsatisfied wait counts
static constexpr std::enable_if_t< std::is_integral_v< T >, int > floorLog2(T x)
static constexpr bool isPowerOf2(const T &n)
static constexpr T roundDown(const T &val, const U &align)
This function is used to align addresses in memory.
constexpr void replaceBits(T &val, unsigned first, unsigned last, B bit_val)
A convenience function to replace bits first to last of val with bit_val in place.
#define panic_if(cond,...)
Conditional panic macro that checks the supplied condition and only panics if the condition is true a...
InstFormat * MachInst
used to represent the encoding of a GCN3 inst.
uint64_t RawMachInst
used to represent a GPU inst in its raw format.
ProbePointArg< PacketInfo > Packet
Packet probe point.
Addr makeLineAddress(Addr addr)
Reference material can be found at the JEDEC website: UFS standard http://www.jedec....
std::shared_ptr< Request > RequestPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
uint64_t Addr
Address type. This will probably be moved somewhere else in the near future.
RubyTester::SenderState SenderState
GPU TranslationState: this is currently somewhat of a bastardization of the usage of SenderState,...