gem5 v20.0.0.2
GPUCoalescer.cc
/*
 * Copyright (c) 2013-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "base/logging.hh"
#include "base/str.hh"
#include "config/the_isa.hh"

#if THE_ISA == X86_ISA
#include "arch/x86/insts/microldstop.hh"

#endif // X86_ISA
#include "mem/ruby/system/GPUCoalescer.hh"

#include "cpu/testers/rubytest/RubyTester.hh"
#include "debug/GPUCoalescer.hh"
#include "debug/MemoryAccess.hh"
#include "debug/ProtocolTrace.hh"
#include "debug/RubyPort.hh"
#include "debug/RubyStats.hh"
#include "gpu-compute/shader.hh"
#include "mem/packet.hh"
#include "mem/ruby/common/DataBlock.hh"
#include "mem/ruby/common/SubBlock.hh"
#include "mem/ruby/network/MessageBuffer.hh"
#include "mem/ruby/profiler/Profiler.hh"
#include "mem/ruby/slicc_interface/AbstractController.hh"
#include "mem/ruby/slicc_interface/RubyRequest.hh"
#include "mem/ruby/structures/CacheMemory.hh"
#include "mem/ruby/system/RubySystem.hh"
#include "params/RubyGPUCoalescer.hh"

using namespace std;

GPUCoalescer *
RubyGPUCoalescerParams::create()
{
    return new GPUCoalescer(this);
}

HSAScope
reqScopeToHSAScope(const RequestPtr &req)
{
    HSAScope accessScope = HSAScope_UNSPECIFIED;
    if (req->isScoped()) {
        if (req->isWavefrontScope()) {
            accessScope = HSAScope_WAVEFRONT;
        } else if (req->isWorkgroupScope()) {
            accessScope = HSAScope_WORKGROUP;
        } else if (req->isDeviceScope()) {
            accessScope = HSAScope_DEVICE;
        } else if (req->isSystemScope()) {
            accessScope = HSAScope_SYSTEM;
        } else {
            fatal("Bad scope type");
        }
    }
    return accessScope;
}

HSASegment
reqSegmentToHSASegment(const RequestPtr &req)
{
    HSASegment accessSegment = HSASegment_GLOBAL;

    if (req->isGlobalSegment()) {
        accessSegment = HSASegment_GLOBAL;
    } else if (req->isGroupSegment()) {
        accessSegment = HSASegment_GROUP;
    } else if (req->isPrivateSegment()) {
        accessSegment = HSASegment_PRIVATE;
    } else if (req->isKernargSegment()) {
        accessSegment = HSASegment_KERNARG;
    } else if (req->isReadonlySegment()) {
        accessSegment = HSASegment_READONLY;
    } else if (req->isSpillSegment()) {
        accessSegment = HSASegment_SPILL;
    } else if (req->isArgSegment()) {
        accessSegment = HSASegment_ARG;
    } else {
        fatal("Bad segment type");
    }

    return accessSegment;
}

UncoalescedTable::UncoalescedTable(GPUCoalescer *gc)
    : coalescer(gc)
{
}

void
UncoalescedTable::insertPacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();

    instMap[seqNum].push_back(pkt);
    DPRINTF(GPUCoalescer, "Adding 0x%X seqNum %d to map. (map %d vec %d)\n",
            pkt->getAddr(), seqNum, instMap.size(), instMap[seqNum].size());
}

bool
UncoalescedTable::packetAvailable()
{
    return !instMap.empty();
}

PerInstPackets*
UncoalescedTable::getInstPackets(int offset)
{
    if (offset >= instMap.size()) {
        return nullptr;
    }

    auto instMapIter = instMap.begin();
    std::advance(instMapIter, offset);

    return &(instMapIter->second);
}
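
// Note: instMap is a std::map keyed by the instruction's sequence number,
// so iteration is in ascending seqNum order and offset 0 always names the
// oldest in-flight instruction. A sketch of how completeIssue() walks it,
// with hypothetical sequence numbers:
//
//     instMap = { 17 -> [pkt, pkt], 21 -> [pkt], 30 -> [pkt, pkt, pkt] }
//     getInstPackets(0); // -> packets of inst 17 (oldest)
//     getInstPackets(2); // -> packets of inst 30
//     getInstPackets(3); // -> nullptr, only three instructions present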

void
UncoalescedTable::updateResources()
{
    for (auto iter = instMap.begin(); iter != instMap.end(); ) {
        if (iter->second.empty()) {
            instMap.erase(iter++);
            coalescer->getGMTokenPort().sendTokens(1);
        } else {
            ++iter;
        }
    }
}
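
// Note: erasing an instruction whose packet list has been fully coalesced
// returns one token through the coalescer's GMTokenPort (see
// getGMTokenPort() and sendTokens() above). The token count thus
// effectively bounds how many memory instructions can be resident in this
// table at once; the exact shader-side use of those tokens is outside this
// file.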

void
UncoalescedTable::printRequestTable(std::stringstream& ss)
{
    ss << "UncoalescedTable contains " << instMap.size()
       << " instruction entries." << std::endl;
    for (auto& inst : instMap) {
        ss << "seqNum " << inst.first
           << " with " << inst.second.size() << " packets"
           << std::endl;
    }
}

void
UncoalescedTable::checkDeadlock(Tick threshold)
{
    Tick current_time = curTick();

    for (auto &it : instMap) {
        for (auto &pkt : it.second) {
            if (current_time - pkt->req->time() > threshold) {
                std::stringstream ss;
                printRequestTable(ss);

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x uncoalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n\n%s", coalescer->getId(),
                      pkt->getAddr(), instMap.size(), current_time,
                      pkt->req->time(), current_time - pkt->req->time(),
                      ss.str());
            }
        }
    }
}

GPUCoalescer::GPUCoalescer(const Params *p)
    : RubyPort(p),
      issueEvent([this]{ completeIssue(); }, "Issue coalesced request",
                 false, Event::Progress_Event_Pri),
      uncoalescedTable(this),
      deadlockCheckEvent([this]{ wakeup(); }, "GPUCoalescer deadlock check"),
      gmTokenPort(name() + ".gmTokenPort", this)
{
    m_store_waiting_on_load_cycles = 0;
    m_store_waiting_on_store_cycles = 0;
    m_load_waiting_on_store_cycles = 0;
    m_load_waiting_on_load_cycles = 0;

    m_outstanding_count = 0;

    coalescingWindow = p->max_coalesces_per_cycle;

    m_max_outstanding_requests = 0;
    m_instCache_ptr = nullptr;
    m_dataCache_ptr = nullptr;

    m_instCache_ptr = p->icache;
    m_dataCache_ptr = p->dcache;
    m_max_outstanding_requests = p->max_outstanding_requests;
    m_deadlock_threshold = p->deadlock_threshold;

    assert(m_max_outstanding_requests > 0);
    assert(m_deadlock_threshold > 0);
    assert(m_instCache_ptr);
    assert(m_dataCache_ptr);

    m_runningGarnetStandalone = p->garnet_standalone;
    assumingRfOCoherence = p->assume_rfo;
}
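
// The parameters read above come from the RubyGPUCoalescer SimObject. A
// hypothetical Python configuration sketch (the values and the
// tcp_cntrl.L1cache reference are illustrative only, not gem5 defaults):
//
//     coalescer = RubyGPUCoalescer(icache = tcp_cntrl.L1cache,
//                                  dcache = tcp_cntrl.L1cache,
//                                  max_outstanding_requests = 256,
//                                  deadlock_threshold = 500000,
//                                  max_coalesces_per_cycle = 1,
//                                  assume_rfo = True,
//                                  garnet_standalone = False)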

GPUCoalescer::~GPUCoalescer()
{
}

Port &
GPUCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "gmTokenPort") {
        return gmTokenPort;
    }

    // delegate to RubyPort otherwise
    return RubyPort::getPort(if_name, idx);
}

void
GPUCoalescer::wakeup()
{
    Cycles current_time = curCycle();
    for (auto& requestList : coalescedTable) {
        for (auto& req : requestList.second) {
            if (current_time - req->getIssueTime() > m_deadlock_threshold) {
                std::stringstream ss;
                printRequestTable(ss);
                ss << "Outstanding requests: " << m_outstanding_count
                   << std::endl;

                panic("Possible Deadlock detected. Aborting!\n"
                      "version: %d request.paddr: 0x%x coalescedTable: %d "
                      "current time: %u issue_time: %d difference: %d\n"
                      "Request Tables:\n %s", m_version,
                      req->getFirstPkt()->getAddr(),
                      coalescedTable.size(), cyclesToTicks(current_time),
                      cyclesToTicks(req->getIssueTime()),
                      cyclesToTicks(current_time - req->getIssueTime()),
                      ss.str());
            }
        }
    }

    Tick tick_threshold = cyclesToTicks(m_deadlock_threshold);
    uncoalescedTable.checkDeadlock(tick_threshold);

    if (m_outstanding_count > 0) {
        // Requests are still outstanding; schedule the next check.
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }
}
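
// Note: despite its name, wakeup() acts as a watchdog. It sweeps both
// request tables for entries older than m_deadlock_threshold cycles and,
// while any request remains outstanding, re-schedules itself
// m_deadlock_threshold cycles into the future.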

void
GPUCoalescer::printRequestTable(std::stringstream& ss)
{
    uncoalescedTable.printRequestTable(ss);

    ss << "CoalescedTable contains " << coalescedTable.size()
       << " address entries." << std::endl;
    for (auto& requestList : coalescedTable) {
        ss << "Addr 0x" << std::hex << requestList.first << std::dec
           << ": type-";
        for (auto& request : requestList.second) {
            ss << RubyRequestType_to_string(request->getRubyType())
               << " pkts-" << request->getPackets().size()
               << " issued-" << request->getIssueTime() << " seqNum-"
               << request->getSeqNum() << "; ";
        }
        ss << std::endl;
    }
}

void
GPUCoalescer::resetStats()
{
    m_latencyHist.reset();
    m_missLatencyHist.reset();
    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist[i]->reset();
        m_missTypeLatencyHist[i]->reset();
        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i][j]->reset();
        }
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist[i]->reset();

        m_IssueToInitialDelayHist[i]->reset();
        m_InitialToForwardDelayHist[i]->reset();
        m_ForwardToFirstResponseDelayHist[i]->reset();
        m_FirstResponseToCompletionDelayHist[i]->reset();
    }
}

void
GPUCoalescer::printProgress(ostream& out) const
{
}

// sets the kernelEndList
void
GPUCoalescer::insertKernel(int wavefront_id, PacketPtr pkt)
{
    // It is unclear whether a duplicate entry can actually occur here,
    // but be careful so that one cannot turn into a simulator hang later.
    DPRINTF(GPUCoalescer, "inserting wf: %d to kernelEndlist\n", wavefront_id);
    assert(kernelEndList.count(wavefront_id) == 0);

    kernelEndList[wavefront_id] = pkt;
    DPRINTF(GPUCoalescer, "kernelEndList->size() = %d\n",
            kernelEndList.size());
}

void
GPUCoalescer::writeCallback(Addr address, DataBlock& data)
{
    writeCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data)
{
    writeCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime)
{
    writeCallback(address, mach, data,
                  initialRequestTime, forwardRequestTime, firstResponseTime,
                  false);
}

void
GPUCoalescer::writeCallback(Addr address,
                            MachineType mach,
                            DataBlock& data,
                            Cycles initialRequestTime,
                            Cycles forwardRequestTime,
                            Cycles firstResponseTime,
                            bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                forwardRequestTime, firstResponseTime, isRegion);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::readCallback(Addr address, DataBlock& data)
{
    readCallback(address, MachineType_NULL, data);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data)
{
    readCallback(address, mach, data, Cycles(0), Cycles(0), Cycles(0));
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime)
{

    readCallback(address, mach, data,
                 initialRequestTime, forwardRequestTime, firstResponseTime,
                 false);
}

void
GPUCoalescer::readCallback(Addr address,
                           MachineType mach,
                           DataBlock& data,
                           Cycles initialRequestTime,
                           Cycles forwardRequestTime,
                           Cycles firstResponseTime,
                           bool isRegion)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();
    fatal_if(crequest->getRubyType() != RubyRequestType_LD,
             "readCallback received non-read type response\n");

    // Iterate over the coalesced requests to respond to as many loads as
    // possible until another request type is seen. Models MSHR for TCP.
    while (crequest->getRubyType() == RubyRequestType_LD) {
        hitCallback(crequest, mach, data, true, crequest->getIssueTime(),
                    forwardRequestTime, firstResponseTime, isRegion);

        delete crequest;
        coalescedTable.at(address).pop_front();
        if (coalescedTable.at(address).empty()) {
            break;
        }

        crequest = coalescedTable.at(address).front();
    }

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}
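
// Note: the loop above drains consecutive loads at the head of the queue
// for this line, which is what "models MSHR for TCP" refers to. As a
// hypothetical example, if the line's queue holds [LD, LD, ST], one data
// response completes both loads and then re-issues the store; a queue of
// [ST, LD] instead goes through writeCallback, which pops only the store
// before issuing the load.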

void
GPUCoalescer::hitCallback(CoalescedRequest* crequest,
                          MachineType mach,
                          DataBlock& data,
                          bool success,
                          Cycles initialRequestTime,
                          Cycles forwardRequestTime,
                          Cycles firstResponseTime,
                          bool isRegion)
{
    PacketPtr pkt = crequest->getFirstPkt();
    Addr request_address = pkt->getAddr();
    Addr request_line_address = makeLineAddress(request_address);

    RubyRequestType type = crequest->getRubyType();

    DPRINTF(GPUCoalescer, "Got hitCallback for 0x%X\n", request_line_address);

    recordMissLatency(crequest, mach,
                      initialRequestTime,
                      forwardRequestTime,
                      firstResponseTime,
                      success, isRegion);
    // update the data
    //
    // this must be done for each request in the coalescer
    std::vector<PacketPtr> pktList = crequest->getPackets();
    DPRINTF(GPUCoalescer, "Responding to %d packets for addr 0x%X\n",
            pktList.size(), request_line_address);
    for (auto& pkt : pktList) {
        request_address = pkt->getAddr();
        if (pkt->getPtr<uint8_t>()) {
            if ((type == RubyRequestType_LD) ||
                (type == RubyRequestType_ATOMIC) ||
                (type == RubyRequestType_ATOMIC_RETURN) ||
                (type == RubyRequestType_IFETCH) ||
                (type == RubyRequestType_RMW_Read) ||
                (type == RubyRequestType_Locked_RMW_Read) ||
                (type == RubyRequestType_Load_Linked)) {
                pkt->setData(
                    data.getData(getOffset(request_address), pkt->getSize()));
            } else {
                data.setData(pkt->getPtr<uint8_t>(),
                             getOffset(request_address), pkt->getSize());
            }
        } else {
            DPRINTF(MemoryAccess,
                    "WARNING. Data not transferred from Ruby to M5 for type "
                    "%s\n",
                    RubyRequestType_to_string(type));
        }

        // If using the RubyTester, update the RubyTester sender state's
        // subBlock with the received data. The tester will later access
        // this state.
        // Note: RubyPort will access its sender state before the
        // RubyTester.
        if (m_usingRubyTester) {
            RubyPort::SenderState *requestSenderState =
                safe_cast<RubyPort::SenderState*>(pkt->senderState);
            RubyTester::SenderState* testerSenderState =
                safe_cast<RubyTester::SenderState*>
                    (requestSenderState->predecessor);
            testerSenderState->subBlock.mergeFrom(data);
        }
    }

    m_outstanding_count--;
    assert(m_outstanding_count >= 0);

    completeHitCallback(pktList);
}
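
// Note: the data movement direction above depends on the request type.
// Loads, ifetches, and returning atomics copy bytes from the Ruby
// DataBlock into each coalesced packet at that packet's offset within the
// line; stores copy the packet's bytes into the DataBlock. Every packet
// that was coalesced into this request is answered from the single
// line-sized response.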

bool
GPUCoalescer::empty() const
{
    return coalescedTable.empty();
}

RubyRequestType
GPUCoalescer::getRequestType(PacketPtr pkt)
{
    RubyRequestType req_type = RubyRequestType_NULL;

    // These types are not supported or not used in GPU caches.
    assert(!pkt->req->isLLSC());
    assert(!pkt->req->isLockedRMW());
    assert(!pkt->req->isInstFetch());
    assert(!pkt->isFlush());

    if (pkt->req->isAtomicReturn()) {
        req_type = RubyRequestType_ATOMIC_RETURN;
    } else if (pkt->req->isAtomicNoReturn()) {
        req_type = RubyRequestType_ATOMIC_NO_RETURN;
    } else if (pkt->isRead()) {
        req_type = RubyRequestType_LD;
    } else if (pkt->isWrite()) {
        req_type = RubyRequestType_ST;
    } else {
        // Acquire and release packets will have been issued by
        // makeRequest, so we do not need to check for them here.
        panic("Unsupported ruby packet type\n");
    }

    return req_type;
}

// Places an uncoalesced packet in uncoalescedTable. If the packet is a
// special type (MemFence, scoping, etc), it is issued immediately.
RequestStatus
GPUCoalescer::makeRequest(PacketPtr pkt)
{
    // Check for GPU Barrier Kernel End or Kernel Begin
    // Leave these to be handled by the child class
    // Kernel End/Barrier = isFlush + isRelease
    // Kernel Begin = isFlush + isAcquire
    if (pkt->req->isKernel()) {
        if (pkt->req->isAcquire()) {
            // This is a kernel begin; leave handling to the
            // virtual xCoalescer::makeRequest
            return RequestStatus_Issued;
        } else if (pkt->req->isRelease()) {
            // This is a kernel end; leave handling to the
            // virtual xCoalescer::makeRequest.
            // If we are here, then no virtual override of this
            // function was called, so we also schedule the callback.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        }
    }

    if (!pkt->isLLSC() && !pkt->req->isLockedRMW() && !pkt->isAtomicOp() &&
        !pkt->isRead() && !pkt->isWrite() && !pkt->isFlush() &&
        (pkt->req->isRelease() || pkt->req->isAcquire())) {
        if (assumingRfOCoherence) {
            // If we reached here, this request must be a memFence
            // and the protocol implements RfO; the coalescer can
            // assume sequential consistency and schedule the callback
            // immediately.
            // Currently the code implements fence callbacks
            // by reusing the mechanism for kernel completions.
            // This should be fixed.
            int wf_id = 0;
            if (pkt->req->hasContextId()) {
                wf_id = pkt->req->contextId();
            }
            insertKernel(wf_id, pkt);
            newKernelEnds.push_back(wf_id);
            if (!issueEvent.scheduled()) {
                schedule(issueEvent, curTick());
            }
            return RequestStatus_Issued;
        } else {
            // If not RfO, return issued here and let the child coalescer
            // take care of it.
            return RequestStatus_Issued;
        }
    }

    uncoalescedTable.insertPacket(pkt);
    DPRINTF(GPUCoalescer, "UC insertPacket 0x%X\n", pkt->getAddr());

    if (!issueEvent.scheduled())
        schedule(issueEvent, curTick());
    // TODO: issue hardware prefetches here
    return RequestStatus_Issued;
}
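
// Note on the special-packet encodings handled above: a kernel begin is a
// packet with isFlush() + isAcquire() set, a kernel end is isFlush() +
// isRelease(), and a memory fence arrives with no read/write/atomic flags
// but with acquire or release semantics. Only plain reads, writes, and
// atomics reach the uncoalescedTable; everything else is answered or
// delegated immediately.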

void
GPUCoalescer::issueRequest(CoalescedRequest* crequest)
{
    PacketPtr pkt = crequest->getFirstPkt();

    int proc_id = -1;
    if (pkt != NULL && pkt->req->hasContextId()) {
        proc_id = pkt->req->contextId();
    }

    // If valid, copy the pc to the ruby request
    Addr pc = 0;
    if (pkt->req->hasPC()) {
        pc = pkt->req->getPC();
    }

    // At the moment, setting scopes only counts for GPU spill space
    // accesses (i.e., pkt->req->isStack()). This scope is REPLACE because
    // it does not need to be flushed at the end of a kernel; private and
    // local may need to be visible at the end of the kernel.
    HSASegment accessSegment = reqSegmentToHSASegment(pkt->req);
    HSAScope accessScope = reqScopeToHSAScope(pkt->req);

    Addr line_addr = makeLineAddress(pkt->getAddr());

    // Creating WriteMask that records written bytes
    // and atomic operations. This enables partial writes
    // and partial reads of those writes
    DataBlock dataBlock;
    dataBlock.clear();
    uint32_t blockSize = RubySystem::getBlockSizeBytes();
    std::vector<bool> accessMask(blockSize, false);
    std::vector< std::pair<int, AtomicOpFunctor*> > atomicOps;
    uint32_t tableSize = crequest->getPackets().size();
    for (int i = 0; i < tableSize; i++) {
        PacketPtr tmpPkt = crequest->getPackets()[i];
        uint32_t tmpOffset = (tmpPkt->getAddr()) - line_addr;
        uint32_t tmpSize = tmpPkt->getSize();
        if (tmpPkt->isAtomicOp()) {
            std::pair<int, AtomicOpFunctor *> tmpAtomicOp(tmpOffset,
                                                          tmpPkt->getAtomicOp());
            atomicOps.push_back(tmpAtomicOp);
        } else if (tmpPkt->isWrite()) {
            dataBlock.setData(tmpPkt->getPtr<uint8_t>(),
                              tmpOffset, tmpSize);
        }
        for (int j = 0; j < tmpSize; j++) {
            accessMask[tmpOffset + j] = true;
        }
    }
    std::shared_ptr<RubyRequest> msg;
    if (pkt->isAtomicOp()) {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                                            pkt->getPtr<uint8_t>(),
                                            pkt->getSize(), pc,
                                            crequest->getRubyType(),
                                            RubyAccessMode_Supervisor, pkt,
                                            PrefetchBit_No, proc_id, 100,
                                            blockSize, accessMask,
                                            dataBlock, atomicOps,
                                            accessScope, accessSegment);
    } else {
        msg = std::make_shared<RubyRequest>(clockEdge(), pkt->getAddr(),
                                            pkt->getPtr<uint8_t>(),
                                            pkt->getSize(), pc,
                                            crequest->getRubyType(),
                                            RubyAccessMode_Supervisor, pkt,
                                            PrefetchBit_No, proc_id, 100,
                                            blockSize, accessMask,
                                            dataBlock,
                                            accessScope, accessSegment);
    }
    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %s\n",
             curTick(), m_version, "Coal", "Begin", "", "",
             printAddress(msg->getPhysicalAddress()),
             RubyRequestType_to_string(crequest->getRubyType()));

    fatal_if(crequest->getRubyType() == RubyRequestType_IFETCH,
             "there should not be any I-Fetch requests in the GPU Coalescer");

    Tick latency = cyclesToTicks(
        m_controller->mandatoryQueueLatency(crequest->getRubyType()));
    assert(latency > 0);

    if (!deadlockCheckEvent.scheduled()) {
        schedule(deadlockCheckEvent,
                 m_deadlock_threshold * clockPeriod() +
                 curTick());
    }

    assert(m_mandatory_q_ptr);
    m_mandatory_q_ptr->enqueue(msg, clockEdge(), latency);
}
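
// Note: the access mask built above is what enables partial-line
// operations. A worked example with hypothetical addresses and 64-byte
// lines: coalescing a 4-byte write to 0x1000 and a 4-byte write to 0x1008
// into line 0x1000 sets accessMask[0..3] and accessMask[8..11] to true and
// copies both payloads into dataBlock at offsets 0 and 8, so the protocol
// can apply only the bytes that were actually written.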

template <class KEY, class VALUE>
std::ostream &
operator<<(ostream &out, const std::unordered_map<KEY, VALUE> &map)
{
    out << "[";
    for (auto i = map.begin(); i != map.end(); ++i)
        out << " " << i->first << "=" << i->second;
    out << " ]";

    return out;
}

void
GPUCoalescer::print(ostream& out) const
{
    out << "[GPUCoalescer: " << m_version
        << ", outstanding requests: " << m_outstanding_count
        << "]";
}


void
GPUCoalescer::recordRequestType(SequencerRequestType requestType) {
    DPRINTF(RubyStats, "Recorded statistic: %s\n",
            SequencerRequestType_to_string(requestType));
}

bool
GPUCoalescer::coalescePacket(PacketPtr pkt)
{
    uint64_t seqNum = pkt->req->getReqInstSeqNum();
    Addr line_addr = makeLineAddress(pkt->getAddr());

    // If the packet has the same line address as a request already in the
    // coalescedTable and has the same sequence number, it can be coalesced.
    if (coalescedTable.count(line_addr)) {
        // Search for a previous coalesced request with the same seqNum.
        auto& creqQueue = coalescedTable.at(line_addr);
        auto citer = std::find_if(creqQueue.begin(), creqQueue.end(),
            [&](CoalescedRequest* c) { return c->getSeqNum() == seqNum; }
        );
        if (citer != creqQueue.end()) {
            (*citer)->insertPacket(pkt);
            return true;
        }
    }

    if (m_outstanding_count < m_max_outstanding_requests) {
        // This is an "aliased" or new request. Create a RubyRequest and
        // append it to the list of "targets" in the coalescing table.
        DPRINTF(GPUCoalescer, "Creating new or aliased request for 0x%X\n",
                line_addr);

        CoalescedRequest *creq = new CoalescedRequest(seqNum);
        creq->insertPacket(pkt);
        creq->setRubyType(getRequestType(pkt));
        creq->setIssueTime(curCycle());

        if (!coalescedTable.count(line_addr)) {
            // If there is no outstanding request for this line address,
            // create a new coalesced request and issue it immediately.
            auto reqList = std::deque<CoalescedRequest*> { creq };
            coalescedTable.insert(std::make_pair(line_addr, reqList));

            DPRINTF(GPUCoalescer, "Issued req type %s seqNum %d\n",
                    RubyRequestType_to_string(creq->getRubyType()), seqNum);
            issueRequest(creq);
        } else {
            // The request is for a line address that is already outstanding
            // but for a different instruction. Add it as a new request to be
            // issued when the current outstanding request is completed.
            coalescedTable.at(line_addr).push_back(creq);
            DPRINTF(GPUCoalescer, "found address 0x%X with new seqNum %d\n",
                    line_addr, seqNum);
        }

        // In both cases, requests are added to the coalescing table and will
        // be counted as outstanding requests.
        m_outstanding_count++;

        return true;
    }

    // The maximum number of outstanding requests has been reached;
    // the packet cannot be accepted this cycle.
    return false;
}
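
// Note: the rules above give three outcomes for an incoming packet. As a
// hypothetical example, for a packet from inst seqNum 7 touching line
// 0x1000: (1) if a CoalescedRequest for line 0x1000 with seqNum 7 already
// exists, the packet joins it; (2) otherwise, if the outstanding-request
// limit is not reached, a new CoalescedRequest is created and either
// issued now (line not outstanding) or queued behind the line's current
// request ("aliased"); (3) if the limit is reached, the packet stays in
// the uncoalescedTable and is retried on a later cycle.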

void
GPUCoalescer::completeIssue()
{
    // Iterate over the maximum number of instructions we can coalesce
    // per cycle (coalescingWindow).
    for (int instIdx = 0; instIdx < coalescingWindow; ++instIdx) {
        PerInstPackets *pktList =
            uncoalescedTable.getInstPackets(instIdx);

        // getInstPackets will return nullptr if no instruction
        // exists at the current offset.
        if (!pktList) {
            break;
        } else {
            // Since we have a pointer to the list of packets in the inst,
            // erase them from the list if coalescing is successful and
            // leave them in the list otherwise. This aggressively attempts
            // to coalesce as many packets as possible from the current inst.
            pktList->remove_if(
                [&](PacketPtr pkt) { return coalescePacket(pkt); }
            );
        }
    }

    // Clean up any instructions in the uncoalesced table that have had
    // all of their packets coalesced and return a token for that column.
    uncoalescedTable.updateResources();

    // have Kernel End releases been issued this cycle
    int len = newKernelEnds.size();
    for (int i = 0; i < len; i++) {
        kernelCallback(newKernelEnds[i]);
    }
    newKernelEnds.clear();
}

void
GPUCoalescer::evictionCallback(Addr address)
{
    ruby_eviction_callback(address);
}

void
GPUCoalescer::kernelCallback(int wavefront_id)
{
    assert(kernelEndList.count(wavefront_id));

    ruby_hit_callback(kernelEndList[wavefront_id]);

    kernelEndList.erase(wavefront_id);
}

void
GPUCoalescer::atomicCallback(Addr address,
                             MachineType mach,
                             const DataBlock& data)
{
    assert(address == makeLineAddress(address));
    assert(coalescedTable.count(address));

    auto crequest = coalescedTable.at(address).front();

    fatal_if((crequest->getRubyType() != RubyRequestType_ATOMIC &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_RETURN &&
              crequest->getRubyType() != RubyRequestType_ATOMIC_NO_RETURN),
             "atomicCallback saw non-atomic type response\n");

    hitCallback(crequest, mach, (DataBlock&)data, true,
                crequest->getIssueTime(), Cycles(0), Cycles(0), false);

    delete crequest;
    coalescedTable.at(address).pop_front();

    if (coalescedTable.at(address).empty()) {
        coalescedTable.erase(address);
    } else {
        auto nextRequest = coalescedTable.at(address).front();
        issueRequest(nextRequest);
    }
}

void
GPUCoalescer::recordCPReadCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPLdHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPLdTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCLdHits++;
    } else {
        CP_LdMiss++;
    }
}

void
GPUCoalescer::recordCPWriteCallBack(MachineID myMachID, MachineID senderMachID)
{
    if (myMachID == senderMachID) {
        CP_TCPStHits++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCP) {
        CP_TCPStTransfers++;
    } else if (machineIDToMachineType(senderMachID) == MachineType_TCC) {
        CP_TCCStHits++;
    } else {
        CP_StMiss++;
    }
}

void
GPUCoalescer::completeHitCallback(std::vector<PacketPtr>& mylist)
{
    for (auto& pkt : mylist) {
        RubyPort::SenderState *ss =
            safe_cast<RubyPort::SenderState *>(pkt->senderState);
        MemSlavePort *port = ss->port;
        assert(port != NULL);

        pkt->senderState = ss->predecessor;
        delete ss;
        port->hitCallback(pkt);
        trySendRetries();
    }

    // We schedule an event in the same tick as hitCallback (similar to
    // makeRequest) rather than calling completeIssue directly to reduce
    // function calls to complete issue. This can only happen if the max
    // outstanding requests is less than the number of slots in the
    // uncoalesced table and makeRequest is not called again.
    if (uncoalescedTable.packetAvailable() && !issueEvent.scheduled()) {
        schedule(issueEvent, curTick());
    }

    testDrainComplete();
}

void
GPUCoalescer::recordMissLatency(CoalescedRequest* crequest,
                                MachineType mach,
                                Cycles initialRequestTime,
                                Cycles forwardRequestTime,
                                Cycles firstResponseTime,
                                bool success, bool isRegion)
{
    RubyRequestType type = crequest->getRubyType();
    Cycles issued_time = crequest->getIssueTime();
    Cycles completion_time = curCycle();
    assert(completion_time >= issued_time);
    Cycles total_lat = completion_time - issued_time;

    // cache stats (valid for RfO protocol only)
    if (mach == MachineType_TCP) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdHits++;
        } else {
            GPU_TCPStHits++;
        }
    } else if (mach == MachineType_L1Cache_wCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCPLdTransfers++;
        } else {
            GPU_TCPStTransfers++;
        }
    } else if (mach == MachineType_TCC) {
        if (type == RubyRequestType_LD) {
            GPU_TCCLdHits++;
        } else {
            GPU_TCCStHits++;
        }
    } else {
        if (type == RubyRequestType_LD) {
            GPU_LdMiss++;
        } else {
            GPU_StMiss++;
        }
    }

    // Profile all access latency, even zero latency accesses
    m_latencyHist.sample(total_lat);
    m_typeLatencyHist[type]->sample(total_lat);

    // Profile the miss latency for all non-zero demand misses
    if (total_lat != Cycles(0)) {
        m_missLatencyHist.sample(total_lat);
        m_missTypeLatencyHist[type]->sample(total_lat);

        if (mach != MachineType_NUM) {
            m_missMachLatencyHist[mach]->sample(total_lat);
            m_missTypeMachLatencyHist[type][mach]->sample(total_lat);

            if ((issued_time <= initialRequestTime) &&
                (initialRequestTime <= forwardRequestTime) &&
                (forwardRequestTime <= firstResponseTime) &&
                (firstResponseTime <= completion_time)) {

                m_IssueToInitialDelayHist[mach]->sample(
                    initialRequestTime - issued_time);
                m_InitialToForwardDelayHist[mach]->sample(
                    forwardRequestTime - initialRequestTime);
                m_ForwardToFirstResponseDelayHist[mach]->sample(
                    firstResponseTime - forwardRequestTime);
                m_FirstResponseToCompletionDelayHist[mach]->sample(
                    completion_time - firstResponseTime);
            }
        }

    }

    DPRINTFR(ProtocolTrace, "%15s %3s %10s%20s %6s>%-6s %s %d cycles\n",
             curTick(), m_version, "Coal",
             success ? "Done" : "SC_Failed", "", "",
             printAddress(crequest->getFirstPkt()->getAddr()), total_lat);
}

void
GPUCoalescer::regStats()
{
    RubyPort::regStats();

    // These statistical variables are not for display.
    // The profiler will collate these across different
    // coalescers and display those collated statistics.
    m_outstandReqHist.init(10);
    m_latencyHist.init(10);
    m_missLatencyHist.init(10);

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_typeLatencyHist.push_back(new Stats::Histogram());
        m_typeLatencyHist[i]->init(10);

        m_missTypeLatencyHist.push_back(new Stats::Histogram());
        m_missTypeLatencyHist[i]->init(10);
    }

    for (int i = 0; i < MachineType_NUM; i++) {
        m_missMachLatencyHist.push_back(new Stats::Histogram());
        m_missMachLatencyHist[i]->init(10);

        m_IssueToInitialDelayHist.push_back(new Stats::Histogram());
        m_IssueToInitialDelayHist[i]->init(10);

        m_InitialToForwardDelayHist.push_back(new Stats::Histogram());
        m_InitialToForwardDelayHist[i]->init(10);

        m_ForwardToFirstResponseDelayHist.push_back(new Stats::Histogram());
        m_ForwardToFirstResponseDelayHist[i]->init(10);

        m_FirstResponseToCompletionDelayHist.push_back(new Stats::Histogram());
        m_FirstResponseToCompletionDelayHist[i]->init(10);
    }

    for (int i = 0; i < RubyRequestType_NUM; i++) {
        m_missTypeMachLatencyHist.push_back(std::vector<Stats::Histogram *>());

        for (int j = 0; j < MachineType_NUM; j++) {
            m_missTypeMachLatencyHist[i].push_back(new Stats::Histogram());
            m_missTypeMachLatencyHist[i][j]->init(10);
        }
    }

    // GPU cache stats
    GPU_TCPLdHits
        .name(name() + ".gpu_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    GPU_TCPLdTransfers
        .name(name() + ".gpu_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    GPU_TCCLdHits
        .name(name() + ".gpu_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    GPU_LdMiss
        .name(name() + ".gpu_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    GPU_TCPStHits
        .name(name() + ".gpu_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    GPU_TCPStTransfers
        .name(name() + ".gpu_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    GPU_TCCStHits
        .name(name() + ".gpu_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    GPU_StMiss
        .name(name() + ".gpu_st_misses")
        .desc("stores that miss in the GPU")
        ;

    // CP cache stats
    CP_TCPLdHits
        .name(name() + ".cp_tcp_ld_hits")
        .desc("loads that hit in the TCP")
        ;
    CP_TCPLdTransfers
        .name(name() + ".cp_tcp_ld_transfers")
        .desc("TCP to TCP load transfers")
        ;
    CP_TCCLdHits
        .name(name() + ".cp_tcc_ld_hits")
        .desc("loads that hit in the TCC")
        ;
    CP_LdMiss
        .name(name() + ".cp_ld_misses")
        .desc("loads that miss in the GPU")
        ;

    CP_TCPStHits
        .name(name() + ".cp_tcp_st_hits")
        .desc("stores that hit in the TCP")
        ;
    CP_TCPStTransfers
        .name(name() + ".cp_tcp_st_transfers")
        .desc("TCP to TCP store transfers")
        ;
    CP_TCCStHits
        .name(name() + ".cp_tcc_st_hits")
        .desc("stores that hit in the TCC")
        ;
    CP_StMiss
        .name(name() + ".cp_st_misses")
        .desc("stores that miss in the GPU")
        ;
}