release/v20-1-0-0/gpu__tlb_8cc_source.html

/*

 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * For use for simulation and test purposes only

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its contributors

 * may be used to endorse or promote products derived from this software

 * without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 *

 * Author: Lisa Hsu

 */


#include "gpu-compute/gpu_tlb.hh"


#include <cmath>

#include <cstring>


#include "arch/x86/faults.hh"

#include "arch/x86/insts/microldstop.hh"

#include "arch/x86/isa_traits.hh"

#include "arch/x86/pagetable.hh"

#include "arch/x86/pagetable_walker.hh"

#include "arch/x86/regs/misc.hh"

#include "arch/x86/regs/msr.hh"

#include "arch/x86/x86_traits.hh"

#include "base/bitfield.hh"

#include "base/logging.hh"

#include "base/output.hh"

#include "base/trace.hh"

#include "cpu/base.hh"

#include "cpu/thread_context.hh"

#include "debug/GPUPrefetch.hh"

#include "debug/GPUTLB.hh"

#include "mem/packet_access.hh"

#include "mem/page_table.hh"

#include "mem/request.hh"

#include "sim/process.hh"

#include "sim/pseudo_inst.hh"


namespace X86ISA

{


    /**
     * Construct the GPU TLB from its configuration parameters.
     *
     * Sizes the entry storage, places every way of every set on that set's
     * free list, records the latency parameters, clamps the number of
     * coalesced requests to the associativity and creates one port object
     * per connected CPU-side / memory-side port.
     *
     * @param p parameter object supplying size, assoc, allocationPolicy,
     *          accessDistance, maxOutstandingReqs, the latencies and the
     *          port connection counts.
     */
    GpuTLB::GpuTLB(const Params *p)
        : ClockedObject(p), configAddress(0), size(p->size),
          cleanupEvent([this]{ cleanup(); }, name(), false,
                       Event::Maximum_Pri),
          exitEvent([this]{ exitCallback(); }, name())
    {
        // Organization of the TLB: size entries split into numSets sets
        // of assoc ways each.
        assoc = p->assoc;
        assert(assoc <= size);
        numSets = size/assoc;
        allocationPolicy = p->allocationPolicy;
        hasMemSidePort = false;
        accessDistance = p->accessDistance;

        // Backing storage plus the per-set bookkeeping lists.
        tlb.assign(size, TlbEntry());
        freeList.resize(numSets);
        entryList.resize(numSets);

        // Nothing is valid yet, so every way starts out on the free list
        // of the set it belongs to.
        for (int set_idx = 0; set_idx < numSets; ++set_idx) {
            auto &free_ways = freeList[set_idx];
            const int first_way = set_idx * assoc;

            for (int way = 0; way < assoc; ++way) {
                free_ways.push_back(&tlb.at(first_way + way));
            }
        }

        // A single set means the TLB is fully associative.
        FA = (size == assoc);

        // Mask applied to the page number to select a set.
        setMask = numSets - 1;

        // Do not allow maxCoalescedReqs to be more than the TLB associativity
        maxCoalescedReqs = p->maxOutstandingReqs;
        if (maxCoalescedReqs > assoc) {
            maxCoalescedReqs = assoc;
            cprintf("Forcing maxCoalescedReqs to %d (TLB assoc.) \n", assoc);
        }

        outstandingReqs = 0;
        hitLatency = p->hitLatency;
        missLatency1 = p->missLatency1;
        missLatency2 = p->missLatency2;

        // One response port per connection on the CPU side.
        for (size_t port_idx = 0;
             port_idx < p->port_cpu_side_ports_connection_count;
             ++port_idx) {
            cpuSidePort.push_back(
                new CpuSidePort(csprintf("%s-port%d", name(), port_idx),
                                this, port_idx));
        }

        // One request port per connection on the memory side.
        for (size_t port_idx = 0;
             port_idx < p->port_mem_side_ports_connection_count;
             ++port_idx) {
            memSidePort.push_back(
                new MemSidePort(csprintf("%s-port%d", name(), port_idx),
                                this, port_idx));
        }
    }


    // fixme: this is never called?
    /**
     * Destructor. Only checks that no translation-return events are still
     * being tracked when the TLB goes away.
     */
    GpuTLB::~GpuTLB()
    {
        // the translation-return event map must have been drained
        assert(translationReturnEvent.empty());
    }


    /**
     * Look up one of this TLB's ports by interface name and index.
     *
     * Requesting a "mem_side_ports" port also records that this TLB has a
     * lower level attached (hasMemSidePort), which later decides whether a
     * miss is forwarded or handled locally.
     *
     * @param if_name either "cpu_side_ports" or "mem_side_ports".
     * @param idx index of the port within the selected vector; must be
     *            non-negative and smaller than the number of ports.
     * @return reference to the requested port; panics on an unknown name
     *         or an out-of-range index.
     */
    Port &
    GpuTLB::getPort(const std::string &if_name, PortID idx)
    {
        if (if_name == "cpu_side_ports") {
            // Reject negative indices as well: PortID is signed and a
            // negative value would otherwise index before the vector.
            if (idx < 0 ||
                static_cast<size_t>(idx) >= cpuSidePort.size()) {
                panic("GpuTLB::getPort: unknown index %d\n", idx);
            }

            return *cpuSidePort[idx];
        } else if (if_name == "mem_side_ports") {
            if (idx < 0 ||
                static_cast<size_t>(idx) >= memSidePort.size()) {
                panic("GpuTLB::getPort: unknown index %d\n", idx);
            }

            hasMemSidePort = true;

            return *memSidePort[idx];
        } else {
            panic("GpuTLB::getPort: unknown port %s\n", if_name);
        }
    }


    /**
     * Install a translation in the set selected by vpn.
     *
     * A free way is used when one exists; otherwise the entry at the back
     * of the set's list (the position furthest from the most recently
     * inserted/promoted entries) is overwritten. The new entry becomes the
     * front of the set's list.
     *
     * @param vpn virtual address used both for set selection and as the
     *            vaddr stored in the installed entry.
     * @param entry contents to copy into the chosen way.
     * @return pointer to the TLB-resident copy of the entry.
     */
    TlbEntry*
    GpuTLB::insert(Addr vpn, TlbEntry &entry)
    {
        const int set = (vpn >> TheISA::PageShift) & setMask;
        auto &free_ways = freeList[set];
        auto &valid_ways = entryList[set];

        TlbEntry *slot = nullptr;

        if (free_ways.empty()) {
            // Set is full: recycle the entry at the tail of the list.
            slot = valid_ways.back();
            valid_ways.pop_back();
        } else {
            slot = free_ways.front();
            free_ways.pop_front();
        }

        // Copy the payload, then force the stored vaddr to the given vpn.
        *slot = entry;
        slot->vaddr = vpn;
        valid_ways.push_front(slot);

        return slot;
    }


    /**
     * Search the set selected by va for an entry whose page covers va.
     *
     * @param va virtual address to look up.
     * @param update_lru when true, a matching entry is moved to the front
     *                   of its set's list.
     * @return iterator to the matching entry, or the end iterator of the
     *         selected set's list when nothing matches.
     */
    GpuTLB::EntryList::iterator
    GpuTLB::lookupIt(Addr va, bool update_lru)
    {
        const int set = (va >> TheISA::PageShift) & setMask;

        // A fully-associative TLB only ever uses set 0.
        assert(!FA || !set);

        auto &ways = entryList[set];
        auto it = ways.begin();

        while (it != ways.end()) {
            TlbEntry *candidate = *it;
            int page_size = candidate->size();

            // Skip entries whose [vaddr, vaddr + page_size) range does not
            // contain va.
            if (candidate->vaddr > va ||
                candidate->vaddr + page_size <= va) {
                ++it;
                continue;
            }

            DPRINTF(GPUTLB, "Matched vaddr %#x to entry starting at %#x "
                    "with size %#x.\n", va, candidate->vaddr, page_size);

            if (update_lru) {
                // Promote the hit to the head of the list.
                ways.push_front(candidate);
                ways.erase(it);
                it = ways.begin();
            }

            break;
        }

        return it;
    }


    /**
     * Pointer-returning wrapper around lookupIt().
     *
     * @param va virtual address to look up.
     * @param update_lru forwarded to lookupIt().
     * @return the matching entry, or nullptr when va is not cached.
     */
    TlbEntry*
    GpuTLB::lookup(Addr va, bool update_lru)
    {
        auto it = lookupIt(va, update_lru);

        // lookupIt() signals a miss with the end iterator of the set that
        // va maps to, so recompute that set for the comparison.
        const int set = (va >> TheISA::PageShift) & setMask;

        return (it == entryList[set].end()) ? nullptr : *it;
    }


    /**
     * Invalidate every entry: all valid entries of each set are appended,
     * in list order, to that set's free list.
     */
    void
    GpuTLB::invalidateAll()
    {
        DPRINTF(GPUTLB, "Invalidating all entries.\n");

        for (int set = 0; set < numSets; ++set) {
            auto &valid_ways = entryList[set];
            auto &free_ways = freeList[set];

            // Hand the whole set back to the free list front-to-back.
            free_ways.insert(free_ways.end(),
                             valid_ways.begin(), valid_ways.end());
            valid_ways.clear();
        }
    }


    void

    GpuTLB::setConfigAddress(uint32_t addr)

    {

        configAddress = addr;

    }


    void

    GpuTLB::invalidateNonGlobal()

    {

        DPRINTF(GPUTLB, "Invalidating all non global entries.\n");


        for (int i = 0; i < numSets; ++i) {

            for (auto entryIt = entryList[i].begin();

                 entryIt != entryList[i].end();) {

                if (!(*entryIt)->global) {

                    freeList[i].push_back(*entryIt);

                    entryList[i].erase(entryIt++);

                } else {

                    ++entryIt;

                }

            }

        }

    }


    /**
     * Remove the translation covering va, if one is cached.
     *
     * @param va virtual address whose mapping should be dropped.
     * @param asn not used by this implementation.
     */
    void
    GpuTLB::demapPage(Addr va, uint64_t asn)
    {
        // Look up without disturbing the replacement order.
        auto it = lookupIt(va, false);
        const int set = (va >> TheISA::PageShift) & setMask;

        if (it == entryList[set].end())
            return;

        freeList[set].push_back(*it);
        entryList[set].erase(it);
    }


    namespace
    {

    /**
     * Local accessor used for requests that target a miscellaneous
     * register instead of memory.
     *
     * On a read the register value is copied into the packet. On a write
     * the packet payload is merged over the current register value and the
     * result is written back to the register.
     *
     * @param read true for a register read, false for a register write.
     * @param regNum index of the register being accessed.
     * @param tc thread context owning the register.
     * @param pkt packet carrying (write) or receiving (read) the data.
     * @return a fixed cost of one cycle.
     */
    Cycles
    localMiscRegAccess(bool read, MiscRegIndex regNum,
                       ThreadContext *tc, PacketPtr pkt)
    {
        // Make sure we don't trot off the end of data: both setData() and
        // writeData() copy pkt->getSize() bytes to/from the local RegVal.
        assert(pkt->getSize() <= sizeof(RegVal));

        if (read) {
            RegVal data = htole(tc->readMiscReg(regNum));
            pkt->setData((uint8_t *)&data);
        } else {
            // Start from the current register contents so that bytes not
            // covered by the packet keep their value, then overlay the
            // packet payload before writing the register.
            RegVal data = htole(tc->readMiscRegNoEffect(regNum));
            pkt->writeData((uint8_t *)&data);
            tc->setMiscReg(regNum, letoh(data));
        }
        return Cycles(1);
    }

    } // anonymous namespace


    /**
     * Translate a request that targets one of the internal (non-memory)
     * address spaces, selected by the prefix encoded in the address.
     *
     * - CPUID space: not implemented, panics.
     * - MSR space: installs a local accessor for the matching register or
     *   returns a GeneralProtection fault for an unknown MSR address.
     * - I/O space: installs a local accessor for a 4-byte access to port
     *   0xCF8; otherwise marks the request uncacheable and strictly
     *   ordered and assigns it a physical address.
     *
     * @param read true when the access is a read.
     * @param req request being translated; updated in place.
     * @param tc thread context issuing the request.
     * @return NoFault on success or a GeneralProtection fault.
     */
    Fault
    GpuTLB::translateInt(bool read, const RequestPtr &req, ThreadContext *tc)
    {
        DPRINTF(GPUTLB, "Addresses references internal memory.\n");

        const Addr vaddr = req->getVaddr();
        const Addr prefix = (vaddr >> 3) & IntAddrPrefixMask;

        if (prefix == IntAddrPrefixCPUID) {
            panic("CPUID memory space not yet implemented!\n");
        }

        if (prefix == IntAddrPrefixMSR) {
            const Addr msr_addr = (vaddr >> 3) & ~IntAddrPrefixMask;

            MiscRegIndex regNum;
            if (!msrAddrToIndex(regNum, msr_addr))
                return std::make_shared<GeneralProtection>(0);

            req->setLocalAccessor(
                [read,regNum](ThreadContext *tc, PacketPtr pkt)
                {
                    return localMiscRegAccess(read, regNum, tc, pkt);
                }
            );

            return NoFault;
        }

        if (prefix != IntAddrPrefixIO) {
            panic("Access to unrecognized internal address space %#x.\n",
                  prefix);
        }

        // TODO If CPL > IOPL or in virtual mode, check the I/O permission
        // bitmap in the TSS.

        const Addr io_port = vaddr & ~IntAddrPrefixMask;

        // Make sure the address fits in the expected 16 bit IO address
        // space.
        assert(!(io_port & ~0xFFFF));

        // A 4-byte access to 0xCF8 goes to the PCI config address register.
        if (io_port == 0xCF8 && req->getSize() == 4) {
            req->setLocalAccessor(
                [read](ThreadContext *tc, PacketPtr pkt)
                {
                    return localMiscRegAccess(
                            read, MISCREG_PCI_CONFIG_ADDRESS, tc, pkt);
                }
            );

            return NoFault;
        }

        // Every other port is a real, uncacheable and ordered access.
        req->setFlags(Request::UNCACHEABLE | Request::STRICT_ORDER);

        Addr paddr = PhysAddrPrefixIO | io_port;

        if ((io_port & ~mask(2)) == 0xCFC) {
            // Ports 0xCFC-0xCFF: redirect into PCI config space when bit
            // 31 of the config address register is set.
            Addr pci_config_addr =
                tc->readMiscRegNoEffect(MISCREG_PCI_CONFIG_ADDRESS);

            if (bits(pci_config_addr, 31, 31)) {
                paddr = PhysAddrPrefixPciConfig |
                        mbits(pci_config_addr, 30, 2) |
                        (io_port & mask(2));
            }
        }

        req->setPaddr(paddr);

        return NoFault;
    }


    bool

    GpuTLB::tlbLookup(const RequestPtr &req,

                      ThreadContext *tc, bool update_stats)

    {

        bool tlb_hit = false;

    #ifndef NDEBUG

        uint32_t flags = req->getFlags();

        int seg = flags & SegmentFlagMask;

    #endif


        assert(seg != SEGMENT_REG_MS);

        Addr vaddr = req->getVaddr();

        DPRINTF(GPUTLB, "TLB Lookup for vaddr %#x.\n", vaddr);

        HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);


        if (m5Reg.prot) {

            DPRINTF(GPUTLB, "In protected mode.\n");

            // make sure we are in 64-bit mode

            assert(m5Reg.mode == LongMode);


            // If paging is enabled, do the translation.

            if (m5Reg.paging) {

                DPRINTF(GPUTLB, "Paging enabled.\n");

                //update LRU stack on a hit

                TlbEntry *entry = lookup(vaddr, true);


                if (entry)

                    tlb_hit = true;


                if (!update_stats) {

                    // functional tlb access for memory initialization

                    // i.e., memory seeding or instr. seeding -> don't update

                    // TLB and stats

                    return tlb_hit;

                }


                localNumTLBAccesses++;


                if (!entry) {

                    localNumTLBMisses++;

                } else {

                    localNumTLBHits++;

                }

            }

        }


        return tlb_hit;

    }


    /**
     * Translate the virtual address of a request and store the resulting
     * physical address in the request.
     *
     * Requests to the internal address space are handed to translateInt().
     * Otherwise the function performs segment limit/protection checks when
     * not in long mode, looks the address up in the TLB when paging is
     * enabled (filling the TLB from the process page table on a miss),
     * performs the paging protection checks and finally redirects accesses
     * to the local APIC page in full-system mode.
     *
     * @param req request to translate; paddr and flags are updated.
     * @param tc thread context issuing the request.
     * @param translation not used by this function.
     * @param mode access mode (read, write or execute).
     * @param delayedResponse always set to false by this function.
     * @param timing when true, latency is updated: hitLatency on a hit,
     *               missLatency1 on a miss, plus missLatency2 when the
     *               page table has no mapping yet for a non-execute access.
     * @param latency output latency, only written when timing is true and
     *                paging is enabled.
     * @return NoFault, a GeneralProtection fault or a PageFault.
     */
    Fault
    GpuTLB::translate(const RequestPtr &req, ThreadContext *tc,
                      Translation *translation, Mode mode,
                      bool &delayedResponse, bool timing, int &latency)
    {
        // Give the out-parameter a defined value on every path, including
        // the early return for internal address-space requests below;
        // callers read it unconditionally after this function returns.
        delayedResponse = false;

        uint32_t flags = req->getFlags();
        int seg = flags & SegmentFlagMask;
        bool storeCheck = flags & (StoreCheck << FlagShift);

        // If this is true, we're dealing with a request
        // to a non-memory address space.
        if (seg == SEGMENT_REG_MS) {
            return translateInt(mode == Mode::Read, req, tc);
        }

        Addr vaddr = req->getVaddr();
        DPRINTF(GPUTLB, "Translating vaddr %#x.\n", vaddr);

        HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);

        // If protected mode has been enabled...
        if (m5Reg.prot) {
            DPRINTF(GPUTLB, "In protected mode.\n");
            // If we're not in 64-bit mode, do protection/limit checks
            if (m5Reg.mode != LongMode) {
                DPRINTF(GPUTLB, "Not in long mode. Checking segment "
                        "protection.\n");

                // Check for a null segment selector.
                if (!(seg == SEGMENT_REG_TSG || seg == SYS_SEGMENT_REG_IDTR ||
                    seg == SEGMENT_REG_HS || seg == SEGMENT_REG_LS)
                    && !tc->readMiscRegNoEffect(MISCREG_SEG_SEL(seg))) {
                    return std::make_shared<GeneralProtection>(0);
                }

                bool expandDown = false;
                SegAttr attr = tc->readMiscRegNoEffect(MISCREG_SEG_ATTR(seg));

                // Read/write permission checks only apply to this range
                // of segment registers.
                if (seg >= SEGMENT_REG_ES && seg <= SEGMENT_REG_HS) {
                    if (!attr.writable && (mode == BaseTLB::Write ||
                        storeCheck))
                        return std::make_shared<GeneralProtection>(0);

                    if (!attr.readable && mode == BaseTLB::Read)
                        return std::make_shared<GeneralProtection>(0);

                    expandDown = attr.expandDown;

                }

                Addr base = tc->readMiscRegNoEffect(MISCREG_SEG_BASE(seg));
                Addr limit = tc->readMiscRegNoEffect(MISCREG_SEG_LIMIT(seg));
                // This assumes we're not in 64 bit mode. If we were, the
                // default address size is 64 bits, overridable to 32.
                int size = 32;
                bool sizeOverride = (flags & (AddrSizeFlagBit << FlagShift));
                SegAttr csAttr = tc->readMiscRegNoEffect(MISCREG_CS_ATTR);

                // The override bit flips the code segment's default
                // address size between 32 and 16 bits.
                if ((csAttr.defaultSize && sizeOverride) ||
                    (!csAttr.defaultSize && !sizeOverride)) {
                    size = 16;
                }

                Addr offset = bits(vaddr - base, size - 1, 0);
                Addr endOffset = offset + req->getSize() - 1;

                if (expandDown) {
                    DPRINTF(GPUTLB, "Checking an expand down segment.\n");
                    warn_once("Expand down segments are untested.\n");

                    if (offset <= limit || endOffset <= limit)
                        return std::make_shared<GeneralProtection>(0);
                } else {
                    if (offset > limit || endOffset > limit)
                        return std::make_shared<GeneralProtection>(0);
                }
            }

            // If paging is enabled, do the translation.
            if (m5Reg.paging) {
                DPRINTF(GPUTLB, "Paging enabled.\n");
                // The vaddr already has the segment base applied.
                TlbEntry *entry = lookup(vaddr);
                localNumTLBAccesses++;

                if (!entry) {
                    localNumTLBMisses++;
                    if (timing) {
                        latency = missLatency1;
                    }

                    if (FullSystem) {
                        fatal("GpuTLB doesn't support full-system mode\n");
                    } else {
                        DPRINTF(GPUTLB, "Handling a TLB miss for address %#x "
                                "at pc %#x.\n", vaddr, tc->instAddr());

                        Process *p = tc->getProcessPtr();
                        const EmulationPageTable::Entry *pte =
                            p->pTable->lookup(vaddr);

                        // No mapping yet: let the process try to fix up
                        // the fault, then retry the page table lookup.
                        if (!pte && mode != BaseTLB::Execute) {
                            // penalize a "page fault" more
                            if (timing)
                                latency += missLatency2;

                            if (p->fixupFault(vaddr))
                                pte = p->pTable->lookup(vaddr);
                        }

                        if (!pte) {
                            return std::make_shared<PageFault>(vaddr, true,
                                                               mode, true,
                                                               false);
                        } else {
                            Addr alignedVaddr = p->pTable->pageAlign(vaddr);

                            DPRINTF(GPUTLB, "Mapping %#x to %#x\n",
                                    alignedVaddr, pte->paddr);

                            TlbEntry gpuEntry(p->pid(), alignedVaddr,
                                              pte->paddr, false, false);
                            entry = insert(alignedVaddr, gpuEntry);
                        }

                        DPRINTF(GPUTLB, "Miss was serviced.\n");
                    }
                } else {
                    localNumTLBHits++;

                    if (timing) {
                        latency = hitLatency;
                    }
                }

                // Do paging protection checks.
                bool inUser = (m5Reg.cpl == 3 &&
                               !(flags & (CPL0FlagBit << FlagShift)));

                CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);
                bool badWrite = (!entry->writable && (inUser || cr0.wp));

                if ((inUser && !entry->user) || (mode == BaseTLB::Write &&
                     badWrite)) {
                    // The page must have been present to get into the TLB in
                    // the first place. We'll assume the reserved bits are
                    // fine even though we're not checking them.
                    return std::make_shared<PageFault>(vaddr, true, mode,
                                                       inUser, false);
                }

                if (storeCheck && badWrite) {
                    // This would fault if this were a write, so return a page
                    // fault that reflects that happening.
                    return std::make_shared<PageFault>(vaddr, true,
                                                       BaseTLB::Write,
                                                       inUser, false);
                }

                DPRINTF(GPUTLB, "Entry found with paddr %#x, doing protection "
                        "checks.\n", entry->paddr);

                // Combine the entry's physical base with the offset of
                // vaddr inside the page.
                int page_size = entry->size();
                Addr paddr = entry->paddr | (vaddr & (page_size - 1));
                DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);
                req->setPaddr(paddr);

                if (entry->uncacheable)
                    req->setFlags(Request::UNCACHEABLE);
            } else {
                //Use the address which already has segmentation applied.
                DPRINTF(GPUTLB, "Paging disabled.\n");
                DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
                req->setPaddr(vaddr);
            }
        } else {
            // Real mode
            DPRINTF(GPUTLB, "In real mode.\n");
            DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, vaddr);
            req->setPaddr(vaddr);
        }

        // Check for an access to the local APIC
        if (FullSystem) {
            LocalApicBase localApicBase =
                tc->readMiscRegNoEffect(MISCREG_APIC_BASE);

            Addr baseAddr = localApicBase.base * PageBytes;
            Addr paddr = req->getPaddr();

            if (baseAddr <= paddr && baseAddr + PageBytes > paddr) {
                // Force the access to be uncacheable.
                req->setFlags(Request::UNCACHEABLE);
                req->setPaddr(x86LocalAPICAddress(tc->contextId(),
                                                  paddr - baseAddr));
            }
        }

        return NoFault;
    }


    /**
     * Atomic-mode translation: runs translate() with timing disabled and
     * without a Translation object.
     *
     * @param req request to translate.
     * @param tc thread context issuing the request.
     * @param mode access mode.
     * @param latency forwarded to translate().
     * @return the fault reported by translate().
     */
    Fault
    GpuTLB::translateAtomic(const RequestPtr &req, ThreadContext *tc,
                            Mode mode, int &latency)
    {
        // translate() needs somewhere to report a delayed response; the
        // value is not inspected here.
        bool delayedResponse;

        Fault fault = GpuTLB::translate(req, tc, nullptr, mode,
                                        delayedResponse, false, latency);

        return fault;
    }


    /**
     * Timing-mode translation: runs translate() with timing enabled and,
     * unless the response is reported as delayed, immediately completes
     * the Translation object with the resulting fault.
     *
     * @param req request to translate.
     * @param tc thread context issuing the request.
     * @param translation completion object; must not be null.
     * @param mode access mode.
     * @param latency forwarded to translate().
     */
    void
    GpuTLB::translateTiming(const RequestPtr &req, ThreadContext *tc,
            Translation *translation, Mode mode, int &latency)
    {
        // Initialize explicitly: translate() can return early (requests to
        // the internal address space) before assigning this flag, and it
        // is read unconditionally below.
        bool delayedResponse = false;
        assert(translation);

        Fault fault = GpuTLB::translate(req, tc, translation, mode,
                                        delayedResponse, true, latency);

        if (!delayedResponse)
            translation->finish(fault, req, tc, mode);
    }


    /**
     * @return the walker pointer stored in this TLB.
     */
    Walker*
    GpuTLB::getWalker()
    {
        return this->walker;
    }


    /**
     * Checkpoint hook. Intentionally empty: nothing is written to the
     * checkpoint by this implementation.
     */
    void
    GpuTLB::serialize(CheckpointOut &cp) const
    {
        // no state is saved
    }


    /**
     * Checkpoint-restore hook. Intentionally empty: nothing is read from
     * the checkpoint by this implementation.
     */
    void
    GpuTLB::unserialize(CheckpointIn &cp)
    {
        // no state is restored
    }


    /**
     * Name and describe the statistics of this TLB and define the derived
     * formulas (local/global miss rate in percent and the average local
     * latency per access).
     */
    void
    GpuTLB::regStats()
    {
        ClockedObject::regStats();

        // Every statistic is published as "<object name>.<suffix>".
        const std::string prefix = name();

        // --- counters for accesses seen by this TLB ---
        localNumTLBAccesses.name(prefix + ".local_TLB_accesses")
            .desc("Number of TLB accesses");

        localNumTLBHits.name(prefix + ".local_TLB_hits")
            .desc("Number of TLB hits");

        localNumTLBMisses.name(prefix + ".local_TLB_misses")
            .desc("Number of TLB misses");

        localTLBMissRate.name(prefix + ".local_TLB_miss_rate")
            .desc("TLB miss rate");

        // --- cycle accounting ---
        accessCycles.name(prefix + ".access_cycles")
            .desc("Cycles spent accessing this TLB level");

        pageTableCycles.name(prefix + ".page_table_cycles")
            .desc("Cycles spent accessing the page table");

        // Miss rate expressed as a percentage of local accesses.
        localTLBMissRate = 100 * localNumTLBMisses / localNumTLBAccesses;

        numUniquePages.name(prefix + ".unique_pages")
            .desc("Number of unique pages touched");

        localCycles.name(prefix + ".local_cycles")
            .desc("Number of cycles spent in queue for all incoming reqs");

        localLatency.name(prefix + ".local_latency")
            .desc("Avg. latency over incoming coalesced reqs");

        localLatency = localCycles / localNumTLBAccesses;

        // --- counters weighted by the number of coalesced requests ---
        globalNumTLBAccesses.name(prefix + ".global_TLB_accesses")
            .desc("Number of TLB accesses");

        globalNumTLBHits.name(prefix + ".global_TLB_hits")
            .desc("Number of TLB hits");

        globalNumTLBMisses.name(prefix + ".global_TLB_misses")
            .desc("Number of TLB misses");

        globalTLBMissRate.name(prefix + ".global_TLB_miss_rate")
            .desc("TLB miss rate");

        globalTLBMissRate = 100 * globalNumTLBMisses / globalNumTLBAccesses;

        avgReuseDistance.name(prefix + ".avg_reuse_distance")
            .desc("avg. reuse distance over all pages (in ticks)");
    }


    /**
     * Start a timing lookup for the translation request carried by pkt.
     *
     * Probes the TLB, records the outcome (and, on a hit, a copy of the
     * matching entry) and schedules a TLBEvent hitLatency cycles in the
     * future that will continue processing the request.
     *
     * @param pkt translation request; its senderState must be a
     *            TranslationState.
     */
    void
    GpuTLB::issueTLBLookup(PacketPtr pkt)
    {
        assert(pkt);
        assert(pkt->senderState);

        const Addr virt_page_addr = roundDown(pkt->req->getVaddr(),
                                              TheISA::PageBytes);

        TranslationState *state =
                safe_cast<TranslationState*>(pkt->senderState);

        // Prefetches must not perturb the statistics.
        const bool update_stats = !state->prefetch;
        ThreadContext *tc = state->tc;

        DPRINTF(GPUTLB, "Translation req. for virt. page addr %#x\n",
                virt_page_addr);

        // Number of requests coalesced into this packet at this level.
        int req_cnt = state->reqCnt.back();

        if (update_stats) {
            // Subtract the start tick now; the matching end tick is added
            // when the translation returns.
            accessCycles -= (curTick() * req_cnt);
            localCycles -= curTick();
            updatePageFootprint(virt_page_addr);
            globalNumTLBAccesses += req_cnt;
        }

        const RequestPtr &req = pkt->req;

        // Access the TLB and figure out if it's a hit or a miss.
        const bool hit = tlbLookup(req, tc, update_stats);
        tlbOutcome lookup_outcome = hit ? TLB_HIT : TLB_MISS;

        if (hit) {
            // Put the entry in SenderState
            TlbEntry *entry = lookup(req->getVaddr(), false);
            assert(entry);

            auto process = state->tc->getProcessPtr();
            state->tlbEntry =
                new TlbEntry(process->pid(), entry->vaddr, entry->paddr,
                             false, false);

            if (update_stats) {
                // the reqCnt has an entry per level, so its size tells us
                // which level we are in
                state->hitLevel = state->reqCnt.size();
                globalNumTLBHits += req_cnt;
            }
        } else if (update_stats) {
            globalNumTLBMisses += req_cnt;
        }

        /*
         * We now know the TLB lookup outcome (if it's a hit or a miss), as
         * well as the TLB access latency.
         *
         * We create and schedule a new TLBEvent which will help us take the
         * appropriate actions (e.g., update TLB on a hit, send request to
         * lower level TLB on a miss, or start a page walk if this was the
         * last-level TLB)
         */
        TLBEvent *tlb_event =
            new TLBEvent(this, virt_page_addr, lookup_outcome, pkt);

        // Only one translation per page may be in flight at a time.
        if (translationReturnEvent.count(virt_page_addr)) {
            panic("Virtual Page Address %#x already has a return event\n",
                  virt_page_addr);
        }

        translationReturnEvent[virt_page_addr] = tlb_event;
        assert(tlb_event);

        const Tick return_tick =
            curTick() + cyclesToTicks(Cycles(hitLatency));

        DPRINTF(GPUTLB, "schedule translationReturnEvent @ curTick %d\n",
                return_tick);

        schedule(tlb_event, return_tick);
    }


    /**
     * Create the event that continues a translation after the lookup
     * latency has elapsed.
     *
     * @param _tlb TLB that owns the translation.
     * @param _addr virtual page address being translated.
     * @param tlb_outcome lookup outcome recorded for the request.
     * @param _pkt packet carrying the translation request.
     */
    GpuTLB::TLBEvent::TLBEvent(GpuTLB* _tlb, Addr _addr,
        tlbOutcome tlb_outcome, PacketPtr _pkt)
            : Event(CPU_Tick_Pri),
              tlb(_tlb),
              virtPageAddr(_addr),
              outcome(tlb_outcome),
              pkt(_pkt)
    {
        // all state is captured by the initializer list
    }


    /**
     * Run the paging protection checks for a translated access.
     *
     * Unlike translate(), a violation is not reported as a fault: the
     * simulation panics with "Page fault detected".
     *
     * @param tc thread context used to read the mode register and CR0.
     * @param pkt packet whose request flags are inspected.
     * @param tlb_entry entry providing the user/writable permissions.
     * @param mode access mode of the request.
     */
    void
    GpuTLB::pagingProtectionChecks(ThreadContext *tc, PacketPtr pkt,
            TlbEntry * tlb_entry, Mode mode)
    {
        HandyM5Reg m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG);
        const uint32_t flags = pkt->req->getFlags();
        const bool storeCheck = flags & (StoreCheck << FlagShift);

        // User mode means CPL 3 without the CPL0 override flag.
        const bool inUser =
            (m5Reg.cpl == 3 && !(flags & (CPL0FlagBit << FlagShift)));
        CR0 cr0 = tc->readMiscRegNoEffect(MISCREG_CR0);

        // Writing a read-only page is illegal for user accesses, and for
        // any access when CR0.WP is set.
        const bool badWrite = (!tlb_entry->writable && (inUser || cr0.wp));

        // The page must have been present to get into the TLB in
        // the first place. We'll assume the reserved bits are
        // fine even though we're not checking them.
        const bool user_violation = inUser && !tlb_entry->user;

        // A store-check access is treated like a write: it would fault if
        // it were one.
        const bool write_violation =
            badWrite && (mode == BaseTLB::Write || storeCheck);

        if (user_violation || write_violation) {
            panic("Page fault detected");
        }
    }


    /**
     * Finish a translation whose entry is available in the packet's
     * TranslationState.
     *
     * On a miss the entry is inserted into this TLB when allocationPolicy
     * is set. The function then runs the protection checks, writes the
     * physical address into the request, sends the packet back through
     * CPU-side port 0 and queues the page address for cleanup.
     *
     * @param virt_page_addr page-aligned virtual address of the request.
     * @param tlb_outcome TLB_HIT when the entry came from this TLB's own
     *                    lookup; any other value is handled as a miss.
     * @param pkt packet carrying the translation request.
     */
    void
    GpuTLB::handleTranslationReturn(Addr virt_page_addr,
        tlbOutcome tlb_outcome, PacketPtr pkt)
    {
        assert(pkt);
        const Addr vaddr = pkt->req->getVaddr();

        TranslationState *state =
            safe_cast<TranslationState*>(pkt->senderState);

        ThreadContext *tc = state->tc;
        Mode mode = state->tlbMode;

        // In both cases the translation travels in the sender state.
        TlbEntry *local_entry = state->tlbEntry;

        if (tlb_outcome == TLB_HIT) {
            DPRINTF(GPUTLB, "Translation Done - TLB Hit for addr %#x\n",
                vaddr);
        } else {
            DPRINTF(GPUTLB, "Translation Done - TLB Miss for addr %#x\n",
                    vaddr);

            assert(local_entry);

            if (allocationPolicy) {
                DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
                        virt_page_addr);

                // Continue with the TLB-resident copy of the entry.
                local_entry = insert(virt_page_addr, *local_entry);
            }

            assert(local_entry);
        }

        DPRINTF(GPUTLB, "Entry found with vaddr %#x,  doing protection checks "
                "while paddr was %#x.\n", local_entry->vaddr,
                local_entry->paddr);

        pagingProtectionChecks(tc, pkt, local_entry, mode);

        // Physical page base plus the offset of vaddr within the page.
        int page_size = local_entry->size();
        const Addr paddr = local_entry->paddr | (vaddr & (page_size - 1));
        DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);

        // Since this packet will be sent through the cpu side port,
        // it must be converted to a response pkt if it is not one already
        if (pkt->isRequest())
            pkt->makeTimingResponse();

        pkt->req->setPaddr(paddr);

        if (local_entry->uncacheable)
            pkt->req->setFlags(Request::UNCACHEABLE);

        //send packet back to coalescer
        cpuSidePort[0]->sendTimingResp(pkt);

        //schedule cleanup event
        cleanupQueue.push(virt_page_addr);

        // schedule this only once per cycle.
        // The check is required because we might have multiple translations
        // returning the same cycle
        // this is a maximum priority event and must be on the same cycle
        // as the cleanup event in TLBCoalescer to avoid a race with
        // IssueProbeEvent caused by TLBCoalescer::MemSidePort::recvReqRetry
        if (cleanupEvent.scheduled())
            return;

        schedule(cleanupEvent, curTick());
    }


    /**
     * Advance the timing translation of one page according to @p outcome.
     *
     * - TLB_HIT: finish the translation via handleTranslationReturn().
     * - TLB_MISS: forward the packet to the lower-level TLB through
     *   memSidePort[0] when hasMemSidePort is set; otherwise re-schedule
     *   this page's TLBEvent with outcome PAGE_WALK after missLatency2
     *   cycles.
     * - PAGE_WALK: look the address up in the process page table (trying
     *   fixupFault() once for non-Execute accesses), store the resulting
     *   TlbEntry (or nullptr) in the sender state and finish the
     *   translation as a miss.
     * - MISS_RETURN: finish the translation as a miss.
     *
     * Any other outcome panics.
     *
     * @param virtPageAddr page-aligned virtual address being translated
     * @param outcome      stage of the translation to process
     * @param pkt          translation packet; its senderState must be a
     *                     TranslationState
     */
    void
    GpuTLB::translationReturn(Addr virtPageAddr, tlbOutcome outcome,
                              PacketPtr pkt)
    {
        DPRINTF(GPUTLB, "Triggered TLBEvent for addr %#x\n", virtPageAddr);

        assert(translationReturnEvent[virtPageAddr]);
        assert(pkt);

        TranslationState *sender_state =
            safe_cast<TranslationState*>(pkt->senderState);

        int req_cnt = sender_state->reqCnt.back();
        // prefetches do not contribute to the statistics
        bool update_stats = !sender_state->prefetch;

        if (outcome == TLB_HIT) {
            handleTranslationReturn(virtPageAddr, TLB_HIT, pkt);

            if (update_stats) {
                accessCycles += (req_cnt * curTick());
                localCycles += curTick();
            }

        } else if (outcome == TLB_MISS) {

            DPRINTF(GPUTLB, "This is a TLB miss\n");
            if (update_stats) {
                accessCycles += (req_cnt * curTick());
                localCycles += curTick();
            }

            if (hasMemSidePort) {
                // the one cycle added here represents the delay from when we
                // get the reply back till when we propagate it to the
                // coalescer above.
                if (update_stats) {
                    accessCycles += (req_cnt * 1);
                    localCycles += 1;
                }

                if (!memSidePort[0]->sendTimingReq(pkt)) {
                    DPRINTF(GPUTLB, "Failed sending translation request to "
                            "lower level TLB for addr %#x\n", virtPageAddr);

                    // remember the packet on the port's retry list
                    memSidePort[0]->retries.push_back(pkt);
                } else {
                    DPRINTF(GPUTLB, "Sent translation request to lower level "
                            "TLB for addr %#x\n", virtPageAddr);
                }
            } else {
                // this is the last level TLB. Start a page walk
                DPRINTF(GPUTLB, "Last level TLB - start a page walk for "
                        "addr %#x\n", virtPageAddr);

                // subtract the start tick now; the PAGE_WALK stage adds the
                // tick at which the walk is processed
                if (update_stats)
                    pageTableCycles -= (req_cnt * curTick());

                TLBEvent *tlb_event = translationReturnEvent[virtPageAddr];
                assert(tlb_event);
                tlb_event->updateOutcome(PAGE_WALK);
                schedule(tlb_event,
                         curTick() + cyclesToTicks(Cycles(missLatency2)));
            }
        } else if (outcome == PAGE_WALK) {
            if (update_stats)
                pageTableCycles += (req_cnt * curTick());

            // Need to access the page table and update the TLB
            DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
                    virtPageAddr);

            Process *p = sender_state->tc->getProcessPtr();
            Addr vaddr = pkt->req->getVaddr();

            // assert() already compiles away under NDEBUG, so no
            // NDEBUG-only local is needed for this check
            assert(p->pTable->pageAlign(vaddr) == virtPageAddr);

            const EmulationPageTable::Entry *pte = p->pTable->lookup(vaddr);
            if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
                    p->fixupFault(vaddr)) {
                // the fault was fixed up: retry the lookup once
                pte = p->pTable->lookup(vaddr);
            }

            if (pte) {
                // print virtPageAddr (asserted equal to the aligned vaddr)
                // so the trace statement does not depend on NDEBUG
                DPRINTF(GPUTLB, "Mapping %#x to %#x\n", virtPageAddr,
                        pte->paddr);

                sender_state->tlbEntry =
                    new TlbEntry(p->pid(), virtPageAddr, pte->paddr, false,
                                 false);
            } else {
                sender_state->tlbEntry = nullptr;
            }

            handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
        } else if (outcome == MISS_RETURN) {
            handleTranslationReturn(virtPageAddr, TLB_MISS, pkt);
        } else {
            panic("Unexpected TLB outcome %d", outcome);
        }
    }


    void

    GpuTLB::TLBEvent::process()

    {

        tlb->translationReturn(virtPageAddr, outcome, pkt);

    }


    /** Fixed, human-readable label for this event type. */
    const char*
    GpuTLB::TLBEvent::description() const
    {
        const char *label = "trigger translationDoneEvent";
        return label;
    }


    /**
     * Replace the outcome that will be passed to translationReturn()
     * the next time this event is processed.
     */
    void
    GpuTLB::TLBEvent::updateOutcome(tlbOutcome _outcome)
    {
        this->outcome = _outcome;
    }


    /** @return the virtual page address stored in this event. */
    Addr
    GpuTLB::TLBEvent::getTLBEventVaddr()
    {
        return this->virtPageAddr;
    }


    /**
     * Accept a timing translation request unless the TLB already has
     * maxCoalescedReqs requests outstanding.
     *
     * @return true if the lookup was issued, false if the request was
     *         refused because the limit was reached
     */
    bool
    GpuTLB::CpuSidePort::recvTimingReq(PacketPtr pkt)
    {
        const bool has_room = tlb->outstandingReqs < tlb->maxCoalescedReqs;

        if (!has_room) {
            DPRINTF(GPUTLB, "Reached maxCoalescedReqs number %d\n",
                    tlb->outstandingReqs);
            return false;
        }

        tlb->issueTLBLookup(pkt);
        // one more translation request is now in flight
        tlb->outstandingReqs++;
        return true;
    }


    /**
     * Finish a functional translation: pick the TLB entry to use, run
     * the protection checks where applicable and write the physical
     * address (and the UNCACHEABLE flag, if set on the entry) into the
     * packet's request.
     *
     * On a miss the entry carried in the sender state is used; when
     * allocationPolicy is set it is first inserted into this TLB and
     * the inserted entry is used instead.
     *
     * @param pkt          translation packet; its senderState must be a
     *                     TranslationState
     * @param tlb_outcome  TLB_HIT for a hit, anything else is handled as
     *                     a miss
     */
    void
    GpuTLB::handleFuncTranslationReturn(PacketPtr pkt, tlbOutcome tlb_outcome)
    {
        TranslationState *sender_state =
            safe_cast<TranslationState*>(pkt->senderState);

        ThreadContext *tc = sender_state->tc;
        Mode mode = sender_state->tlbMode;
        Addr vaddr = pkt->req->getVaddr();

        // both paths start from the entry stored in the sender state
        TlbEntry *local_entry = sender_state->tlbEntry;

        if (tlb_outcome != TLB_HIT) {
            DPRINTF(GPUTLB, "Functional Translation Done - TLB miss for addr "
                    "%#x\n", vaddr);

            TlbEntry *new_entry = local_entry;
            assert(new_entry);

            if (allocationPolicy) {
                Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);

                DPRINTF(GPUTLB, "allocating entry w/ addr %#x\n",
                        virt_page_addr);

                local_entry = insert(virt_page_addr, *new_entry);
            }

            assert(local_entry);
        } else {
            DPRINTF(GPUTLB, "Functional Translation Done - TLB hit for addr "
                    "%#x\n", vaddr);
        }

        DPRINTF(GPUTLB, "Entry found with vaddr %#x, doing protection checks "
                "while paddr was %#x.\n", local_entry->vaddr,
                local_entry->paddr);

        // prefetches, and requests without an entry in the sender state,
        // skip the protection checks
        const bool skip_checks =
            sender_state->prefetch || !sender_state->tlbEntry;
        if (!skip_checks)
            pagingProtectionChecks(tc, pkt, local_entry, mode);

        // combine the entry's physical page with the in-page offset
        int page_size = local_entry->size();
        Addr page_offset = vaddr & (page_size - 1);
        Addr paddr = local_entry->paddr | page_offset;
        DPRINTF(GPUTLB, "Translated %#x -> %#x.\n", vaddr, paddr);

        pkt->req->setPaddr(paddr);

        if (local_entry->uncacheable) {
            pkt->req->setFlags(Request::UNCACHEABLE);
        }
    }


    /**
     * Functional translation entry point. This is used for atomic
     * translations, so everything has to happen within the same call.
     *
     * Looks the address up in this TLB. On a miss the request is either
     * forwarded functionally to the lower-level TLB (when hasMemSidePort
     * is set) or resolved against the process page table. On success a
     * newly allocated TlbEntry is stored in the sender state and
     * handleFuncTranslationReturn() fills in the physical address.
     *
     * For prefetch requests a failed translation is not an error: the
     * function returns early and the request is left without a paddr.
     *
     * @param pkt translation packet; its senderState must be a
     *            TranslationState
     */
    void
    GpuTLB::CpuSidePort::recvFunctional(PacketPtr pkt)
    {
        TranslationState *sender_state =
            safe_cast<TranslationState*>(pkt->senderState);

        ThreadContext *tc = sender_state->tc;
        // prefetches do not contribute to the statistics
        bool update_stats = !sender_state->prefetch;

        Addr vaddr = pkt->req->getVaddr();
        Addr virt_page_addr = roundDown(vaddr, TheISA::PageBytes);

        if (update_stats)
            tlb->updatePageFootprint(virt_page_addr);

        // do the TLB lookup; update_stats is forwarded to tlbLookup()
        bool success = tlb->tlbLookup(pkt->req, tc, update_stats);
        tlbOutcome tlb_outcome = success ? TLB_HIT : TLB_MISS;

        // functional mode means no coalescing
        // global metrics are the same as the local metrics
        if (update_stats) {
            tlb->globalNumTLBAccesses++;

            if (success) {
                sender_state->hitLevel = sender_state->reqCnt.size();
                tlb->globalNumTLBHits++;
            }
        }

        if (!success) {
            if (update_stats)
                tlb->globalNumTLBMisses++;

            if (tlb->hasMemSidePort) {
                // there is a TLB below -> propagate down the TLB hierarchy
                tlb->memSidePort[0]->sendFunctional(pkt);

                // If no valid translation from a prefetch, then just return
                if (sender_state->prefetch && !pkt->req->hasPaddr())
                    return;
            } else {
                // Need to access the page table and update the TLB
                DPRINTF(GPUTLB, "Doing a page walk for address %#x\n",
                        virt_page_addr);

                Process *p = tc->getProcessPtr();

                // assert() already compiles away under NDEBUG, so no
                // NDEBUG-only local is needed for this check
                assert(p->pTable->pageAlign(vaddr) == virt_page_addr);

                const EmulationPageTable::Entry *pte =
                        p->pTable->lookup(vaddr);
                if (!pte && sender_state->tlbMode != BaseTLB::Execute &&
                        p->fixupFault(vaddr)) {
                    // the fault was fixed up: retry the lookup once
                    pte = p->pTable->lookup(vaddr);
                }

                // If this was a prefetch and the walk failed, send an empty
                // TLB entry back so that it can be figured out as empty
                // and handled accordingly.
                if (!pte && sender_state->prefetch) {
                    DPRINTF(GPUPrefetch, "Prefetch failed %#x\n",
                            virt_page_addr);

                    sender_state->tlbEntry = nullptr;

                    return;
                }

                // no PageFaults are permitted after
                // the second page table lookup
                assert(pte);

                // print virt_page_addr (asserted equal to the aligned
                // vaddr) so the trace does not depend on NDEBUG
                DPRINTF(GPUTLB, "Mapping %#x to %#x\n", virt_page_addr,
                        pte->paddr);

                sender_state->tlbEntry =
                    new TlbEntry(p->pid(), virt_page_addr,
                                 pte->paddr, false, false);
            }
        } else {
            // Print the address itself. Passing the result of lookup()
            // here printed an entry pointer and performed an additional
            // lookup only when the trace argument was evaluated.
            DPRINTF(GPUPrefetch, "Functional Hit for vaddr %#x\n", vaddr);

            TlbEntry *entry = tlb->lookup(vaddr, update_stats);

            assert(entry);

            Process *p = tc->getProcessPtr();
            sender_state->tlbEntry =
                new TlbEntry(p->pid(), entry->vaddr, entry->paddr,
                             false, false);
        }

        // This is the function that would populate pkt->req with the paddr of
        // the translation. But if no translation happens (i.e. prefetch
        // fails) then the early returns in the above code will keep this
        // function from executing.
        tlb->handleFuncTranslationReturn(pkt, tlb_outcome);
    }


    /**
     * Retry notification on the CPU-side port. This port only ever sends
     * replies, so being called here is treated as a fatal error.
     */
    void
    GpuTLB::CpuSidePort::recvReqRetry()
    {
        // no retries are expected on this port
        panic("recvReqRetry called");
    }


    /**
     * @return an empty address range list; the ranges are currently not
     *         checked by the requestor.
     */
    AddrRangeList
    GpuTLB::CpuSidePort::getAddrRanges() const
    {
        return AddrRangeList();
    }


    /**
     * Handle a timing response arriving on the memory-side port: mark the
     * pending TLBEvent of the corresponding page as MISS_RETURN and
     * schedule it one clock period from now.
     *
     * @return always true
     */
    bool
    GpuTLB::MemSidePort::recvTimingResp(PacketPtr pkt)
    {
        const Addr page_vaddr =
            roundDown(pkt->req->getVaddr(), TheISA::PageBytes);

        DPRINTF(GPUTLB, "MemSidePort recvTiming for virt_page_addr %#x\n",
                page_vaddr);

        // an event must already be registered for this page
        TLBEvent *pending_event = tlb->translationReturnEvent[page_vaddr];
        assert(pending_event);
        assert(page_vaddr == pending_event->getTLBEventVaddr());

        pending_event->updateOutcome(MISS_RETURN);

        const Tick fire_tick = curTick() + tlb->clockPeriod();
        tlb->schedule(pending_event, fire_tick);

        return true;
    }


    /**
     * Retry notification on the memory-side port. Retries are expected to
     * reach only the TLBCoalescer, never the TLB, so this is fatal.
     */
    void
    GpuTLB::MemSidePort::recvReqRetry()
    {
        // a retry arriving here is unexpected
        panic("recvReqRetry called");
    }


    void

    GpuTLB::cleanup()

    {

        while (!cleanupQueue.empty()) {

            Addr cleanup_addr = cleanupQueue.front();

            cleanupQueue.pop();


            // delete TLBEvent

            TLBEvent * old_tlb_event = translationReturnEvent[cleanup_addr];

            delete old_tlb_event;

            translationReturnEvent.erase(cleanup_addr);


            // update number of outstanding requests

            outstandingReqs--;

        }


        for (int i = 0; i < cpuSidePort.size(); ++i) {

            cpuSidePort[i]->sendRetryReq();

        }

    }


    /**
     * Record one access to @p virt_page_addr in TLBFootprint.
     *
     * The first access to a page bumps numUniquePages; later accesses add
     * the ticks elapsed since the previous access to the page's
     * totalReuseDistance. When accessDistance is enabled the current value
     * of localNumTLBAccesses is appended to the page's access history.
     */
    void
    GpuTLB::updatePageFootprint(Addr virt_page_addr)
    {
        // zeroed record, only stored if the page has not been seen before
        AccessInfo blank_info;
        blank_info.lastTimeAccessed = 0;
        blank_info.accessesPerPage = 0;
        blank_info.totalReuseDistance = 0;
        blank_info.sumDistance = 0;
        blank_info.meanDistance = 0;

        const auto inserted = TLBFootprint.insert(
            AccessPatternTable::value_type(virt_page_addr, blank_info));

        AccessInfo &page_info = inserted.first->second;
        const bool first_page_access = inserted.second;

        if (!first_page_access) {
            int accessed_before = curTick() - page_info.lastTimeAccessed;
            page_info.totalReuseDistance += accessed_before;
        } else {
            numUniquePages++;
        }

        page_info.accessesPerPage++;
        page_info.lastTimeAccessed = curTick();

        if (accessDistance) {
            page_info.localTLBAccesses.push_back(localNumTLBAccesses.value());
        }
    }


    void

    GpuTLB::exitCallback()

    {

        std::ostream *page_stat_file = nullptr;


        if (accessDistance) {


            // print per page statistics to a separate file (.csv format)

            // simout is the gem5 output directory (default is m5out or the one

            // specified with -d

            page_stat_file = simout.create(name().c_str())->stream();


            // print header

            *page_stat_file

                << "page,max_access_distance,mean_access_distance, "

                << "stddev_distance" << std::endl;

        }


        // update avg. reuse distance footprint

        unsigned int sum_avg_reuse_distance_per_page = 0;


        // iterate through all pages seen by this TLB

        for (auto &iter : TLBFootprint) {

            sum_avg_reuse_distance_per_page += iter.second.totalReuseDistance /

                                               iter.second.accessesPerPage;


            if (accessDistance) {

                unsigned int tmp = iter.second.localTLBAccesses[0];

                unsigned int prev = tmp;


                for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {

                    if (i) {

                        tmp = prev + 1;

                    }


                    prev = iter.second.localTLBAccesses[i];

                    // update the localTLBAccesses value

                    // with the actual differece

                    iter.second.localTLBAccesses[i] -= tmp;

                    // compute the sum of AccessDistance per page

                    // used later for mean

                    iter.second.sumDistance +=

                        iter.second.localTLBAccesses[i];

                }


                iter.second.meanDistance =

                    iter.second.sumDistance / iter.second.accessesPerPage;


                // compute std_dev and max  (we need a second round because we

                // need to know the mean value

                unsigned int max_distance = 0;

                unsigned int stddev_distance = 0;


                for (int i = 0; i < iter.second.localTLBAccesses.size(); ++i) {

                    unsigned int tmp_access_distance =

                        iter.second.localTLBAccesses[i];


                    if (tmp_access_distance > max_distance) {

                        max_distance = tmp_access_distance;

                    }


                    unsigned int diff =

                        tmp_access_distance - iter.second.meanDistance;

                    stddev_distance += pow(diff, 2);


                }


                stddev_distance =

                    sqrt(stddev_distance/iter.second.accessesPerPage);


                if (page_stat_file) {

                    *page_stat_file << std::hex << iter.first << ",";

                    *page_stat_file << std::dec << max_distance << ",";

                    *page_stat_file << std::dec << iter.second.meanDistance

                                    << ",";

                    *page_stat_file << std::dec << stddev_distance;

                    *page_stat_file << std::endl;

                }


                // erase the localTLBAccesses array

                iter.second.localTLBAccesses.clear();

            }

        }


        if (!TLBFootprint.empty()) {

            avgReuseDistance =

                sum_avg_reuse_distance_per_page / TLBFootprint.size();

        }


        //clear the TLBFootprint map

        TLBFootprint.clear();

    }

} // namespace X86ISA


/**
 * Factory hook of the parameter object: build a GpuTLB configured from
 * these parameters. The object is heap-allocated and returned as a raw
 * pointer.
 */
X86ISA::GpuTLB*
X86GPUTLBParams::create()
{
    X86ISA::GpuTLB *gpu_tlb = new X86ISA::GpuTLB(this);
    return gpu_tlb;
}