tlb_coalescer.cc
/*
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "arch/amdgpu/vega/tlb_coalescer.hh"

#include <cstring>

#include "arch/amdgpu/vega/page_size.hh"
#include "arch/amdgpu/vega/pagetable.hh"
#include "arch/generic/mmu.hh"
#include "base/logging.hh"
#include "debug/GPUTLB.hh"
#include "sim/process.hh"

namespace gem5
{

VegaTLBCoalescer::VegaTLBCoalescer(const VegaTLBCoalescerParams &p)
    : ClockedObject(p),
      TLBProbesPerCycle(p.probesPerCycle),
      coalescingWindow(p.coalescingWindow),
      disableCoalescing(p.disableCoalescing),
      probeTLBEvent([this]{ processProbeTLBEvent(); },
                    "Probe the TLB below",
                    false, Event::CPU_Tick_Pri),
      cleanupEvent([this]{ processCleanupEvent(); },
                   "Cleanup issuedTranslationsTable hashmap",
                   false, Event::Maximum_Pri),
      tlb_level(p.tlb_level),
      maxDownstream(p.maxDownstream),
      numDownstream(0)
{
    // create the response ports based on the number of connected ports
    for (size_t i = 0; i < p.port_cpu_side_ports_connection_count; ++i) {
        cpuSidePort.push_back(new CpuSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }

    // create the request ports based on the number of connected ports
    for (size_t i = 0; i < p.port_mem_side_ports_connection_count; ++i) {
        memSidePort.push_back(new MemSidePort(csprintf("%s-port%d", name(), i),
                                              this, i));
    }

    default_pgSize = p.default_pgSize;
    potentialPagesize.insert(default_pgSize);
}

Port &
VegaTLBCoalescer::getPort(const std::string &if_name, PortID idx)
{
    if (if_name == "cpu_side_ports") {
        if (idx >= static_cast<PortID>(cpuSidePort.size())) {
            panic("VegaTLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *cpuSidePort[idx];
    } else if (if_name == "mem_side_ports") {
        if (idx >= static_cast<PortID>(memSidePort.size())) {
            panic("VegaTLBCoalescer::getPort: unknown index %d\n", idx);
        }

        return *memSidePort[idx];
    } else {
        panic("VegaTLBCoalescer::getPort: unknown port %s\n", if_name);
    }
}

/*
 * This method returns true if the <incoming_pkt>
 * can be coalesced with <coalesced_pkt> and false otherwise.
 * A fixed set of rules is checked; the rules can
 * potentially be adjusted based on the TLB level.
 */
bool
VegaTLBCoalescer::canCoalesce(PacketPtr incoming_pkt, PacketPtr coalesced_pkt,
                              Addr pagebytes = VegaISA::PageBytes)
{
    if (disableCoalescing)
        return false;

    GpuTranslationState *incoming_state =
        safe_cast<GpuTranslationState*>(incoming_pkt->senderState);

    GpuTranslationState *coalesced_state =
        safe_cast<GpuTranslationState*>(coalesced_pkt->senderState);

    // Rule 1: Coalesce requests only if they
    // fall within the same virtual page
    Addr incoming_virt_page_addr = roundDown(incoming_pkt->req->getVaddr(),
                                             pagebytes);

    Addr coalesced_virt_page_addr = roundDown(coalesced_pkt->req->getVaddr(),
                                              pagebytes);

    if (incoming_virt_page_addr != coalesced_virt_page_addr)
        return false;

    // Rule 2: Coalesce requests only if they
    // share a TLB Mode, i.e., they are both read
    // or both write requests.
    BaseMMU::Mode incoming_mode = incoming_state->tlbMode;
    BaseMMU::Mode coalesced_mode = coalesced_state->tlbMode;

    if (incoming_mode != coalesced_mode)
        return false;

    // When we can coalesce a packet, update the reqCnt,
    // i.e., the number of packets represented by
    // this coalesced packet.
    if (!incoming_state->isPrefetch)
        coalesced_state->reqCnt.back() += incoming_state->reqCnt.back();

    return true;
}
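
/*
 * A minimal sketch of rule 1 above, assuming 4 KiB pages; the addresses
 * are hypothetical. Two requests coalesce only when roundDown() maps
 * their virtual addresses to the same page frame:
 *
 *     Addr pagebytes = 0x1000;    // 4 KiB
 *     Addr va1 = 0x7fff0010;      // -> page 0x7fff0000
 *     Addr va2 = 0x7fff0ff8;      // -> page 0x7fff0000, may coalesce
 *     Addr va3 = 0x7fff1000;      // -> page 0x7fff1000, never coalesces
 *     assert(roundDown(va1, pagebytes) == roundDown(va2, pagebytes));
 *     assert(roundDown(va1, pagebytes) != roundDown(va3, pagebytes));
 */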

/*
 * We need to update the physical addresses of all the translation requests
 * that were coalesced into the one that just returned.
 */
void
VegaTLBCoalescer::updatePhysAddresses(PacketPtr pkt)
{
    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    // Make a copy. This gets deleted after the first is sent back on the port
    assert(sender_state->tlbEntry);
    VegaISA::VegaTlbEntry tlb_entry =
        *safe_cast<VegaISA::VegaTlbEntry *>(sender_state->tlbEntry);
    Addr first_entry_vaddr = tlb_entry.vaddr;
    Addr first_entry_paddr = tlb_entry.paddr;
    int page_size = tlb_entry.size();

    potentialPagesize.insert(page_size);

    Addr virt_page_addr;

    // Find the coalesced translation request.
    for (auto pgsize_seen : potentialPagesize) {
        virt_page_addr = roundDown(pkt->req->getVaddr(), pgsize_seen);
        if (issuedTranslationsTable.count(virt_page_addr) != 0)
            break;
    }

    DPRINTF(GPUTLB, "Update phys. addr. for %d coalesced reqs for page %#x\n",
            issuedTranslationsTable[virt_page_addr].size(), virt_page_addr);

    bool uncacheable = tlb_entry.uncacheable();
    int first_hit_level = sender_state->hitLevel;
    bool is_system = pkt->req->systemReq();

    for (int i = 0;
         i < issuedTranslationsTable[virt_page_addr].size(); ++i) {
        PacketPtr local_pkt = issuedTranslationsTable[virt_page_addr][i];

        Addr local_pkt_vaddr = local_pkt->req->getVaddr();

        // Check if the pending req's vaddr falls within the returned page;
        // if not, reissue the pending req as a 4k page.
        if (!(first_entry_vaddr <= local_pkt_vaddr &&
              local_pkt_vaddr < first_entry_vaddr + page_size)) {
            reissue_pkt_helper(local_pkt);
            continue;
        }

        GpuTranslationState *sender_state =
            safe_cast<GpuTranslationState*>(local_pkt->senderState);

        // We are sending the packet back, so pop the reqCnt associated
        // with this level in the TLB hierarchy.
        if (!sender_state->isPrefetch) {
            sender_state->reqCnt.pop_back();
            localCycles += curCycle();
        }

        /*
         * Only the first packet from this coalesced request has been
         * translated. Grab the translated phys. page addr and update the
         * physical addresses of the remaining packets with the appropriate
         * page offsets.
         */
        if (i) {
            Addr paddr = first_entry_paddr
                       + (local_pkt->req->getVaddr() & (page_size - 1));
            local_pkt->req->setPaddr(paddr);

            if (uncacheable)
                local_pkt->req->setFlags(Request::UNCACHEABLE);

            // Update senderState->tlbEntry so we can insert
            // the correct TlbEntry in the TLBs above.

            //auto p = sender_state->tc->getProcessPtr();
            if (sender_state->tlbEntry == NULL) {
                // not set by the lower (L2) coalescer
                sender_state->tlbEntry =
                    new VegaISA::VegaTlbEntry(1 /* VMID TODO */,
                                              first_entry_vaddr,
                                              first_entry_paddr,
                                              tlb_entry.logBytes,
                                              tlb_entry.pte);
            }

            // Update the hitLevel for all uncoalesced reqs
            // so that each packet knows where it hit
            // (used for statistics in the CUs).
            sender_state->hitLevel = first_hit_level;
        }

        // Copy PTE system bit information to coalesced requests
        local_pkt->req->setSystemReq(is_system);

        ResponsePort *return_port = sender_state->ports.back();
        sender_state->ports.pop_back();

        // Translation is done - convert to a response pkt if necessary
        // and send the translation back.
        if (local_pkt->isRequest()) {
            local_pkt->makeTimingResponse();
        }

        return_port->sendTimingResp(local_pkt);
    }

    // Schedule cleanup for the end of this cycle.
    // This is a maximum priority event and must be on
    // the same cycle as the GPUTLB cleanup event to prevent
    // race conditions with an IssueProbeEvent caused by
    // MemSidePort::recvReqRetry.
    cleanupQueue.push(virt_page_addr);

    if (!cleanupEvent.scheduled())
        schedule(cleanupEvent, curTick());
}
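
/*
 * A minimal sketch of the offset arithmetic above, assuming a power-of-two
 * page size; the values are hypothetical. The low bits of the virtual
 * address are the page offset, which carries over unchanged onto the
 * translated physical page:
 *
 *     int page_size = 0x1000;                // 4 KiB
 *     Addr first_entry_paddr = 0x20000000;   // translated page frame
 *     Addr vaddr = 0x7fff0a38;               // a coalesced req's vaddr
 *     Addr paddr = first_entry_paddr + (vaddr & (page_size - 1));
 *     // paddr == 0x20000a38
 */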

// Re-coalesce the packet to 4k pages.
void
VegaTLBCoalescer::reissue_pkt_helper(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    DPRINTF(GPUTLB, "Trying to re-issue req at tick: %llu, addr: %#x\n",
            sender_state->issueTime, pkt->req->getVaddr());

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    Tick tick_index = sender_state->issueTime / coalescingWindow;

    if (coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescerFIFO[tick_index].size();
    }

    // See if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index.
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescerFIFO[tick_index][i].first[0];
        if (coalescerFIFO[tick_index][i].second != VegaISA::PageBytes)
            continue;

        if (canCoalesce(pkt, first_packet, VegaISA::PageBytes)) {
            coalescerFIFO[tick_index][i].first.push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced re-issued req %i w/ tick_index %d "
                    "has %d reqs\n",
                    i, tick_index,
                    coalescerFIFO[tick_index][i].first.size());

            didCoalesce = true;
            break;
        }
    }

    // If this is the first request for this tick_index,
    // or we did not manage to coalesce, update stats
    // and make the necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescerFIFO[tick_index].push_back(
            std::make_pair(new_array, VegaISA::PageBytes));

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push re-issued req\n", tick_index,
                coalescerFIFO[tick_index].size());
    }

    // Schedule probeTLBEvent next cycle to send the
    // coalesced requests to the TLB.
    if (!probeTLBEvent.scheduled()) {
        schedule(probeTLBEvent,
                 curTick() + clockPeriod());
    }
}

// Receive translation requests, create a coalesced request,
// and send them to the TLB (TLBProbesPerCycle).
bool
VegaTLBCoalescer::CpuSidePort::recvTimingReq(PacketPtr pkt)
{
    // first packet of a coalesced request
    PacketPtr first_packet = nullptr;
    // true if we are able to do coalescing
    bool didCoalesce = false;
    // number of coalesced reqs for a given window
    int coalescedReq_cnt = 0;

    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->isPrefetch;

    if (coalescer->tlb_level == 1 && coalescer->mustStallCUPort(this))
        return false;

    // Push back the port to remember the path back.
    sender_state->ports.push_back(this);

    if (update_stats) {
        // If reqCnt is empty, then this packet does not represent
        // multiple uncoalesced reqs (pkts) but just a single pkt.
        // If it does, though, then the reqCnt for each level in the
        // hierarchy accumulates the total number of reqs this packet
        // represents.
        int req_cnt = 1;

        if (!sender_state->reqCnt.empty())
            req_cnt = sender_state->reqCnt.back();

        sender_state->reqCnt.push_back(req_cnt);

        // update statistics
        coalescer->uncoalescedAccesses++;
        req_cnt = sender_state->reqCnt.back();
        DPRINTF(GPUTLB, "receiving pkt w/ req_cnt %d\n", req_cnt);
        coalescer->queuingCycles -= (coalescer->curCycle() * req_cnt);
        coalescer->localqueuingCycles -= coalescer->curCycle();
        coalescer->localCycles -= coalescer->curCycle();
    }

    // Coalesce based on the time the packet arrives at the coalescer (here).
    if (!sender_state->issueTime)
        sender_state->issueTime = curTick();

    // The tick index is used as a key to the coalescerFIFO hashmap.
    // It is shared by all candidates that fall within the
    // given coalescingWindow.
    Tick tick_index = sender_state->issueTime / coalescer->coalescingWindow;

    if (coalescer->coalescerFIFO.count(tick_index)) {
        coalescedReq_cnt = coalescer->coalescerFIFO[tick_index].size();
    }

    // See if we can coalesce the incoming pkt with another
    // coalesced request with the same tick_index.
    for (int i = 0; i < coalescedReq_cnt; ++i) {
        first_packet = coalescer->coalescerFIFO[tick_index][i].first[0];
        Addr pg_size = coalescer->coalescerFIFO[tick_index][i].second;

        if (coalescer->canCoalesce(pkt, first_packet, pg_size)) {
            coalescer->coalescerFIFO[tick_index][i].first.push_back(pkt);

            DPRINTF(GPUTLB, "Coalesced req %i w/ tick_index %d has %d reqs\n",
                    i, tick_index,
                    coalescer->coalescerFIFO[tick_index][i].first.size());

            didCoalesce = true;
            break;
        }
    }

    // If this is the first request for this tick_index,
    // or we did not manage to coalesce, update stats
    // and make the necessary allocations.
    if (!coalescedReq_cnt || !didCoalesce) {
        if (update_stats)
            coalescer->coalescedAccesses++;

        std::vector<PacketPtr> new_array;
        new_array.push_back(pkt);
        coalescer->coalescerFIFO[tick_index].push_back(
            std::make_pair(new_array, coalescer->default_pgSize));

        DPRINTF(GPUTLB, "coalescerFIFO[%d] now has %d coalesced reqs after "
                "push\n", tick_index,
                coalescer->coalescerFIFO[tick_index].size());
    }

    // Schedule probeTLBEvent next cycle to send the
    // coalesced requests to the TLB.
    if (!coalescer->probeTLBEvent.scheduled()) {
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->clockPeriod());
    }

    return true;
}
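
/*
 * A minimal sketch of the coalescing-window bucketing above; the values
 * are hypothetical. Requests whose issue times fall within the same
 * window of coalescingWindow ticks share a tick_index and are therefore
 * candidates for coalescing with each other:
 *
 *     Tick coalescingWindow = 1000;
 *     Tick t1 = 4200, t2 = 4999, t3 = 5001;
 *     // t1 / coalescingWindow == 4 and t2 / coalescingWindow == 4:
 *     //     same bucket, may coalesce
 *     // t3 / coalescingWindow == 5:
 *     //     different bucket, never coalesces with t1 or t2
 */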

void
VegaTLBCoalescer::CpuSidePort::recvReqRetry()
{
    panic("recvReqRetry called");
}

void
VegaTLBCoalescer::CpuSidePort::recvFunctional(PacketPtr pkt)
{
    GpuTranslationState *sender_state =
        safe_cast<GpuTranslationState*>(pkt->senderState);

    bool update_stats = !sender_state->isPrefetch;

    if (update_stats)
        coalescer->uncoalescedAccesses++;

    Addr virt_page_addr = roundDown(pkt->req->getVaddr(), VegaISA::PageBytes);
    int map_count = coalescer->issuedTranslationsTable.count(virt_page_addr);

    if (map_count) {
        DPRINTF(GPUTLB, "Warning! Functional access to addr %#x sees timing "
                "req. pending\n", virt_page_addr);
    }

    coalescer->memSidePort[0]->sendFunctional(pkt);
}

AddrRangeList
VegaTLBCoalescer::CpuSidePort::getAddrRanges() const
{
    // currently not checked by the requestor
    AddrRangeList ranges;

    return ranges;
}

/*
 * A translation completed and returned.
 */
bool
VegaTLBCoalescer::MemSidePort::recvTimingResp(PacketPtr pkt)
{
    coalescer->updatePhysAddresses(pkt);

    if (coalescer->tlb_level != 1)
        return true;

    coalescer->decrementNumDownstream();

    DPRINTF(GPUTLB,
            "recvTimingResp: clscr = %p, numDownstream = %d, max = %d\n",
            coalescer, coalescer->numDownstream, coalescer->maxDownstream);

    coalescer->unstallPorts();
    return true;
}

void
VegaTLBCoalescer::MemSidePort::recvReqRetry()
{
    // We've received a retry. Schedule a probeTLBEvent.
    if (!coalescer->probeTLBEvent.scheduled())
        coalescer->schedule(coalescer->probeTLBEvent,
                            curTick() + coalescer->clockPeriod());
}

void
VegaTLBCoalescer::MemSidePort::recvFunctional(PacketPtr pkt)
{
    fatal("Memory side recvFunctional() not implemented in TLB coalescer.\n");
}

/*
 * Here we scan the coalescer FIFO and issue the max
 * number of permitted probes to the TLB below. We
 * permit bypassing of coalesced requests for the same
 * tick_index.
 *
 * We do not access the next tick_index unless we've
 * drained the previous one. The coalesced requests
 * that are successfully sent are moved to the
 * issuedTranslationsTable (the table which keeps
 * track of the outstanding reqs).
 */
void
VegaTLBCoalescer::processProbeTLBEvent()
{
    // number of TLB probes sent so far
    int sent_probes = 0;

    // A probe is not issued either when the recvTiming of the TLB below
    // returns false or when there is another outstanding request for the
    // same virt. page.

    DPRINTF(GPUTLB, "triggered VegaTLBCoalescer %s\n", __func__);

    if ((tlb_level == 1)
        && (availDownstreamSlots() == 0)) {
        DPRINTF(GPUTLB, "IssueProbeEvent - no downstream slots, bail out\n");
        return;
    }

    for (auto iter = coalescerFIFO.begin();
         iter != coalescerFIFO.end();) {
        int coalescedReq_cnt = iter->second.size();
        int i = 0;
        int vector_index = 0;

        DPRINTF(GPUTLB, "coalescedReq_cnt is %d for tick_index %d\n",
                coalescedReq_cnt, iter->first);

        while (i < coalescedReq_cnt) {
            ++i;
            PacketPtr first_packet = iter->second[vector_index].first[0];
            // The request to the coalescer is organized as follows:
            // the coalescerFIFO is a map indexed by coalescingWindow
            // cycle. Only requests that fall within the same
            // coalescingWindow are considered for coalescing. Each entry
            // of the coalescerFIFO is a vector of vectors. There is one
            // entry for each distinct virtual page number, and it contains
            // a vector of all requests that are coalesced for the same
            // virtual page address.

            // Compute the virtual page address for this request using the
            // assumed page size, stored in pair.second of the coalesced req.
            Addr virt_page_addr = roundDown(first_packet->req->getVaddr(),
                                            iter->second[vector_index].second);

            // Is there another outstanding request for the same page addr?
            // Consider all possible page sizes.
            int pending_reqs = 0;
            for (auto i_pgsize : potentialPagesize)
                pending_reqs += issuedTranslationsTable.count
                    (roundDown(first_packet->req->getVaddr(), i_pgsize));

            if (pending_reqs) {
                DPRINTF(GPUTLB, "Cannot issue - There are pending reqs for "
                        "page %#x\n", virt_page_addr);

                ++vector_index;
                continue;
            }

            // send the coalesced request for virt_page_addr
            if (!memSidePort[0]->sendTimingReq(first_packet)) {
                DPRINTF(GPUTLB,
                        "Failed to send TLB request for page %#x\n",
                        virt_page_addr);

                // No need for a retries queue since we are already
                // buffering the coalesced request in coalescerFIFO.
                // Arka:: No point trying to send other requests to the
                // TLB at this point since it is busy. Retries will be
                // called later by the TLB below.
                return;
            } else {

                if (tlb_level == 1)
                    incrementNumDownstream();

                GpuTranslationState *tmp_sender_state =
                    safe_cast<GpuTranslationState*>(first_packet->senderState);

                bool update_stats = !tmp_sender_state->isPrefetch;

                if (update_stats) {
                    // req_cnt is the total number of packets represented
                    // by the one we just sent, counting all the way from
                    // the top of the TLB hierarchy (i.e., from the CU).
                    int req_cnt = tmp_sender_state->reqCnt.back();
                    queuingCycles += (curCycle() * req_cnt);

                    DPRINTF(GPUTLB, "%s sending pkt w/ req_cnt %d\n",
                            name(), req_cnt);

                    // pkt_cnt is the number of packets we coalesced into
                    // the one we just sent, but only at this coalescer
                    // level.
                    int pkt_cnt = iter->second[vector_index].first.size();
                    localqueuingCycles += (curCycle() * pkt_cnt);
                }

                DPRINTF(GPUTLB, "Successfully sent TLB request for page "
                        "%#x\n", virt_page_addr);

                // copy coalescedReq to issuedTranslationsTable
                issuedTranslationsTable[virt_page_addr]
                    = iter->second[vector_index].first;

                // erase the entry of this coalesced req
                iter->second.erase(iter->second.begin() + vector_index);

                if (iter->second.empty())
                    assert(i == coalescedReq_cnt);

                sent_probes++;

                if (sent_probes == TLBProbesPerCycle ||
                    ((tlb_level == 1) && (!availDownstreamSlots()))) {
                    // Before returning, make sure that empty vectors are
                    // taken out. Not a big issue though, since a later
                    // invocation will take them out anyway.
                    if (iter->second.empty())
                        coalescerFIFO.erase(iter);

                    // Schedule probeTLBEvent next cycle to send the
                    // remaining coalesced requests to the TLB.
                    if (!probeTLBEvent.scheduled()) {
                        schedule(probeTLBEvent,
                                 curTick() + clockPeriod());
                    }
                    return;
                }
            }
        }

        // If there are no more coalesced reqs for this tick_index,
        // erase the hash_map entry with the first iterator.
        if (iter->second.empty()) {
            coalescerFIFO.erase(iter++);
        } else {
            ++iter;
        }
    }
}
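
/*
 * A sketch of the shape of the two bookkeeping structures walked above,
 * inferred from the accesses in this file (the actual typedefs,
 * CoalescingFIFO and CoalescingTable, live in tlb_coalescer.hh):
 *
 *     // tick_index -> coalesced reqs in that window; each coalesced req
 *     // pairs the packets for one virtual page with the page size that
 *     // was assumed when they were coalesced
 *     std::map<Tick, std::vector<std::pair<std::vector<PacketPtr>, Addr>>>
 *         coalescerFIFO;
 *
 *     // virtual page address -> packets of the coalesced req currently
 *     // outstanding in the TLB below
 *     std::unordered_map<Addr, std::vector<PacketPtr>>
 *         issuedTranslationsTable;
 */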

void
VegaTLBCoalescer::processCleanupEvent()
{
    while (!cleanupQueue.empty()) {
        Addr cleanup_addr = cleanupQueue.front();
        cleanupQueue.pop();
        issuedTranslationsTable.erase(cleanup_addr);

        DPRINTF(GPUTLB, "Cleanup - Delete coalescer entry with key %#x\n",
                cleanup_addr);
    }
}

void
VegaTLBCoalescer::regStats()
{
    ClockedObject::regStats();

    uncoalescedAccesses
        .name(name() + ".uncoalesced_accesses")
        .desc("Number of uncoalesced TLB accesses")
        ;

    coalescedAccesses
        .name(name() + ".coalesced_accesses")
        .desc("Number of coalesced TLB accesses")
        ;

    queuingCycles
        .name(name() + ".queuing_cycles")
        .desc("Number of cycles spent in queue")
        ;

    localqueuingCycles
        .name(name() + ".local_queuing_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localCycles
        .name(name() + ".local_cycles")
        .desc("Number of cycles spent in queue for all incoming reqs")
        ;

    localLatency
        .name(name() + ".local_latency")
        .desc("Avg. latency over all incoming pkts")
        ;

    latency
        .name(name() + ".latency")
        .desc("Avg. latency over all incoming pkts")
        ;

    localLatency = localqueuingCycles / uncoalescedAccesses;
    latency = localCycles / uncoalescedAccesses;
}
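
/*
 * A note on the +/- accounting that feeds these stats: a request's
 * queueing time is accumulated in two halves, so no per-packet timestamp
 * is needed. A sketch with hypothetical cycle numbers:
 *
 *     // at arrival (recvTimingReq), curCycle() == 100:
 *     //     localqueuingCycles -= 100;
 *     // at dispatch (processProbeTLBEvent), curCycle() == 130:
 *     //     localqueuingCycles += 130;
 *     // net contribution: 30 cycles spent queued in this coalescer
 */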

void
VegaTLBCoalescer::insertStalledPortIfNotMapped(CpuSidePort *port)
{
    assert(tlb_level == 1);
    if (stalledPortsMap.count(port) != 0)
        return; // we already know this port is stalled

    stalledPortsMap[port] = port;
    stalledPortsQueue.push(port);
    DPRINTF(GPUTLB,
            "insertStalledPortIfNotMapped: port %p, mapSz = %d, qsz = %d\n",
            port, stalledPortsMap.size(), stalledPortsQueue.size());
}

bool
VegaTLBCoalescer::mustStallCUPort(CpuSidePort *port)
{
    assert(tlb_level == 1);

    DPRINTF(GPUTLB, "mustStallCUPort: downstream = %d, max = %d\n",
            numDownstream, maxDownstream);

    if (availDownstreamSlots() == 0) {
        warn("RED ALERT - VegaTLBCoalescer::mustStallCUPort\n");
        insertStalledPortIfNotMapped(port);
        return true;
    }
    else
        return false;
}

void
VegaTLBCoalescer::unstallPorts()
{
    assert(tlb_level == 1);
    if (!stalledPorts() || availDownstreamSlots() == 0)
        return;

    DPRINTF(GPUTLB, "unstallPorts()\n");
    /*
     * This check is needed because we can be called from recvTimingResp()
     * or, synchronously due to having called sendRetry, from
     * recvTimingReq().
     */
    if (availDownstreamSlots() == 0) // can happen if retry sent 1 downstream
        return;
    /*
     * Consider this scenario:
     * 1) max downstream is reached
     * 2) port1 tries to send a req, can't => stalledPortsQueue = [port1]
     * 3) port2 tries to send a req, can't => stalledPortsQueue = [port1,
     *    port2]
     * 4) a request completes; we remove port1 from both data structures
     *    and call sendRetry => stalledPortsQueue = [port2]
     * 5) port1 sends one req downstream and a second is rejected
     *    => stalledPortsQueue = [port2, port1]
     *
     * So we round-robin, and each stalled port can send 1 req on retry.
     */
    assert(availDownstreamSlots() == 1);
    auto port = stalledPortsQueue.front();
    DPRINTF(GPUTLB, "sending retry for port = %p(%s)\n", port, port->name());
    stalledPortsQueue.pop();
    auto iter = stalledPortsMap.find(port);
    assert(iter != stalledPortsMap.end());
    stalledPortsMap.erase(iter);
    port->sendRetryReq(); // cu will synchronously call recvTimingReq
}
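
/*
 * A minimal sketch of the invariant kept by the two structures above:
 * stalledPortsQueue orders the retries (round-robin), while
 * stalledPortsMap only guards against inserting the same port twice.
 * Hypothetical usage:
 *
 *     insertStalledPortIfNotMapped(port1);   // queue = [port1]
 *     insertStalledPortIfNotMapped(port1);   // no-op, already mapped
 *     insertStalledPortIfNotMapped(port2);   // queue = [port1, port2]
 *     unstallPorts();                        // retries port1 first
 */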

} // namespace gem5