develop/global__memory__pipeline_8cc_source.html

/*

 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 */


#define __STDC_FORMAT_MACROS

#include <cinttypes>

#include "debug/GPUCoalescer.hh"

#include "debug/GPUMem.hh"

#include "debug/GPUReg.hh"

#include "gpu-compute/compute_unit.hh"

#include "gpu-compute/global_memory_pipeline.hh"

#include "gpu-compute/gpu_dyn_inst.hh"

#include "gpu-compute/shader.hh"

#include "gpu-compute/vector_register_file.hh"

#include "gpu-compute/wavefront.hh"


namespace gem5

{


// Build the global memory pipeline that belongs to compute unit `cu`.
// The queue size and the per-wavefront request limit are read from the
// compute unit parameters `p`; both in-flight counters start at zero.
// The pipeline's stats group is created with `cu` as its parent.
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams &p,
                                     ComputeUnit &cu)
    : computeUnit(cu),
      _name(cu.name() + ".GlobalMemPipeline"),
      gmQueueSize(p.global_mem_queue_size),
      maxWaveRequests(p.max_wave_requests),
      inflightStores(0),
      inflightLoads(0),
      stats(&cu)
{
}


// Copy the global memory size from the shader that the owning compute
// unit points at into this pipeline's own globalMemSize field.
void
GlobalMemPipeline::init()
{
    const auto *shader = computeUnit.shader;
    globalMemSize = shader->globalMemSize;
}


// Report whether the token manager of the instruction's compute unit
// currently has the token this instruction needs.
//
// Returns true when the token is available and false when the
// instruction has to stall. No token is taken here.
bool
GlobalMemPipeline::coalescerReady(GPUDynInstPtr mp) const
{
    // System requests do not use GPU coalescer tokens, so none of them
    // should reach this point; one showing up here means it bypassed the
    // operand gather check stage.
    assert(!mp->isSystemReq());

    // One token from the coalescer's uncoalesced table is needed per
    // instruction.
    const int tokens_needed = 1;

    // The tokens come from a single pool, so one query is sufficient.
    DPRINTF(GPUCoalescer, "Checking for %d tokens\n", tokens_needed);
    const bool tokens_available =
        mp->computeUnit()->getTokenManager()->haveTokens(tokens_needed);

    if (!tokens_available) {
        DPRINTF(GPUCoalescer, "Stalling inst because coalsr is busy!\n");
    }

    return tokens_available;
}


// Take the coalescer token needed by `mp` from the token manager of the
// instruction's compute unit. The token is asserted to be available.
void
GlobalMemPipeline::acqCoalescerToken(GPUDynInstPtr mp)
{
    // One token from the coalescer's uncoalesced table is needed per
    // instruction.
    const int tokens_needed = 1;

    DPRINTF(GPUCoalescer, "Acquiring %d token(s)\n", tokens_needed);

    auto *const token_mgr = mp->computeUnit()->getTokenManager();
    assert(token_mgr->haveTokens(tokens_needed));
    token_mgr->acquireTokens(tokens_needed);
}


// Check the per-wavefront limit on outstanding vector memory requests.
//
// Returns true while the sum of the wavefront's outstanding global
// memory read and write requests is still below maxWaveRequests, and
// false once that limit has been reached.
bool
GlobalMemPipeline::outstandingReqsCheck(GPUDynInstPtr mp) const
{
    const Wavefront *wf = mp->wavefront();
    const int outstanding_gm_reqs =
        wf->outstandingReqsRdGm + wf->outstandingReqsWrGm;

    return outstanding_gm_reqs < maxWaveRequests;
}


void


GlobalMemPipeline::exec()

{

    // apply any returned global memory operations

    GPUDynInstPtr m = getNextReadyResp();


    bool accessVrf = true;

    Wavefront *w = nullptr;


    // check the VRF to see if the operands of a load (or load component

    // of an atomic) are accessible

    if (m && (m->isLoad() || m->isAtomicRet())) {

        w = m->wavefront();


        accessVrf = w->computeUnit->vrf[w->simdId]->

            canScheduleWriteOperandsFromLoad(w, m);


    }


    if (m && m->latency.rdy() && computeUnit.glbMemToVrfBus.rdy() &&

        accessVrf && (computeUnit.shader->coissue_return ||

        computeUnit.vectorGlobalMemUnit.rdy())) {


        w = m->wavefront();


        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: Completing global mem instr %s\n",

                m->cu_id, m->simdId, m->wfSlotId, m->disassemble());

        m->completeAcc(m);

        if (m->isFlat()) {

            w->decLGKMInstsIssued();

            w->untrackLGKMInst(m);

        }

        w->decVMemInstsIssued();

        w->untrackVMemInst(m);


        if (m->isLoad() || m->isAtomicRet()) {

            w->computeUnit->vrf[w->simdId]->

            scheduleWriteOperandsFromLoad(w, m);

        }


        completeRequest(m);


        Tick accessTime = curTick() - m->getAccessTime();


        // Decrement outstanding requests count

        computeUnit.shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);

        if (m->isStore() || m->isAtomic() || m->isMemSync()) {

            computeUnit.shader->sampleStore(accessTime);

            computeUnit.shader->ScheduleAdd(&w->outstandingReqsWrGm,

                                             m->time, -1);

        }


        if (m->isLoad() || m->isAtomic() || m->isMemSync()) {

            computeUnit.shader->sampleLoad(accessTime);

            computeUnit.shader->ScheduleAdd(&w->outstandingReqsRdGm,

                                             m->time, -1);

        }


        w->validateRequestCounters();


        // Generate stats for round-trip time for vectory memory insts

        // going all the way to memory and stats for individual cache

        // blocks generated by the instruction.

        m->profileRoundTripTime(curTick(), InstMemoryHop::Complete);

        computeUnit.shader->sampleInstRoundTrip(m->getRoundTripTime());

        computeUnit.shader->sampleLineRoundTrip(m->getLineAddressTime());


        // Mark write bus busy for appropriate amount of time

        computeUnit.glbMemToVrfBus.set(m->time);

        if (!computeUnit.shader->coissue_return)

            w->computeUnit->vectorGlobalMemUnit.set(m->time);

    }


    // If pipeline has executed a global memory instruction

    // execute global memory packets and issue global

    // memory packets to DTLB

    if (!gmIssuedRequests.empty()) {

        GPUDynInstPtr mp = gmIssuedRequests.front();

        if (mp->isLoad() || mp->isAtomic()) {

            if (inflightLoads >= gmQueueSize) {

                return;

            } else {

                ++inflightLoads;

            }

        } else if (mp->isStore()) {

            if (inflightStores >= gmQueueSize) {

                return;

            } else {

                ++inflightStores;

            }

        }


        DPRINTF(GPUCoalescer, "initiateAcc for %s seqNum %d\n",

                mp->disassemble(), mp->seqNum());

        mp->initiateAcc(mp);


        if (mp->isStore() && mp->isGlobalSeg()) {

            mp->wavefront()->decExpInstsIssued();

            mp->wavefront()->untrackExpInst(mp);

        }


        if (((mp->isMemSync() && !mp->isEndOfKernel()) || !mp->isMemSync())) {

            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),

                std::make_pair(mp, false)));

        }


        if (!mp->isMemSync() && !mp->isEndOfKernel() && mp->allLanesZero()) {

            handleResponse(mp);

            computeUnit.getTokenManager()->recvTokens(1);

        }


        gmIssuedRequests.pop();


        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",

                computeUnit.cu_id, mp->simdId, mp->wfSlotId);

    }

}


// Look at the first entry of the ordered response buffer.
//
// Returns that entry's instruction when its "done" flag is set. Returns
// nullptr when the buffer is empty or the first entry is not done yet;
// later entries are never considered.
GPUDynInstPtr
GlobalMemPipeline::getNextReadyResp()
{
    if (gmOrderedRespBuffer.empty()) {
        return nullptr;
    }

    // Each mapped value is an (instruction, done flag) pair.
    const auto &oldest = gmOrderedRespBuffer.begin()->second;
    if (oldest.second) {
        return oldest.first;
    }

    return nullptr;
}


// Retire a finished request: give back its in-flight slot and remove it
// from the ordered response buffer.
//
// The instruction is asserted to be the first entry of the buffer and
// to be marked done.
void
GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
{
    // Atomics share the load counter; anything that is neither a
    // load/atomic nor a store holds no in-flight slot.
    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
        assert(inflightLoads > 0);
        --inflightLoads;
    } else if (gpuDynInst->isStore()) {
        assert(inflightStores > 0);
        --inflightStores;
    }

    const auto seq_num = gpuDynInst->seqNum();

    // Only the oldest request may be popped, and by the time we get
    // here it must already carry its "done" mark.
    assert(gmOrderedRespBuffer.begin()->first == seq_num);
    assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
    assert(gmOrderedRespBuffer.begin()->second.second);

    // Remove the entry through its unique sequence number.
    gmOrderedRespBuffer.erase(seq_num);
}


// Enqueue a request in the issue queue and update the owning
// wavefront's request counters.
//
// For each side (read/write) that applies, the wavefront's "in pipe"
// counter is decremented and its "outstanding" counter is incremented:
//   - load:            read side only
//   - store:           write side only
//   - everything else: both sides (atomic: both read and write)
// The instruction's access time and the "Initiate" round-trip hop are
// stamped with the current tick.
void
GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
{
    Wavefront *wf = gpuDynInst->wavefront();

    // isStore() is only consulted when the instruction is not a load,
    // so a load is always accounted on the read side alone.
    const bool is_load = gpuDynInst->isLoad();
    const bool is_store = !is_load && gpuDynInst->isStore();

    // Read side: everything except a plain store.
    if (!is_store) {
        wf->rdGmReqsInPipe--;
        wf->outstandingReqsRdGm++;
    }

    // Write side: everything except a load.
    if (!is_load) {
        wf->wrGmReqsInPipe--;
        wf->outstandingReqsWrGm++;
    }

    wf->outstandingReqs++;
    wf->validateRequestCounters();

    const Tick now = curTick();
    gpuDynInst->setAccessTime(now);
    gpuDynInst->profileRoundTripTime(now, InstMemoryHop::Initiate);

    gmIssuedRequests.push(gpuDynInst);
}


// Mark the ordered response buffer entry of `gpuDynInst` as done.
//
// The entry is looked up by the instruction's sequence number and is
// asserted to exist.
void
GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
{
    const auto entry = gmOrderedRespBuffer.find(gpuDynInst->seqNum());

    // A response can only arrive for a request that was already placed
    // in the ordered response buffer.
    assert(entry != gmOrderedRespBuffer.end());

    // Mapped value is (instruction, done flag); set the flag.
    entry->second.second = true;
}


// Write a snapshot of the pipeline state to std::cout: the in-flight
// load/store counts, the sizes of the issue queue and of the ordered
// response buffer, then one line per buffered instruction showing its
// disassembly and its "done" flag.
void
GlobalMemPipeline::printProgress()
{
    std::cout << "GMPipe inflight: " << inflightLoads << "/"
              << inflightStores;
    std::cout << " issued: " << gmIssuedRequests.size();
    std::cout << " returned: " << gmOrderedRespBuffer.size() << " -- :\n";

    for (const auto &entry : gmOrderedRespBuffer) {
        // entry.second is the (instruction, done flag) pair.
        const auto &inst = entry.second.first;
        const bool done = entry.second.second;

        std::cout << "\t" << inst->disassemble() << " -- " << done << "\n";
    }
}


// Create the "GlobalMemPipeline" statistics group under `parent` and
// add the loadVrfBankConflictCycles stat to it.
GlobalMemPipeline::GlobalMemPipelineStats::GlobalMemPipelineStats(
    statistics::Group *parent)
    : statistics::Group(parent, "GlobalMemPipeline"),
      ADD_STAT(loadVrfBankConflictCycles,
               "total number of cycles GM data are delayed "
               "before updating the VRF")
{
}


} // namespace gem5

DPRINTF
#define DPRINTF(x,...)
Definition trace.hh:209

gem5::ComputeUnit
Definition compute_unit.hh:204

gem5::GlobalMemPipeline::getNextReadyResp
GPUDynInstPtr getNextReadyResp()
Find the next ready response to service.
Definition global_memory_pipeline.cc:247

gem5::GlobalMemPipeline::inflightLoads
int inflightLoads
Definition global_memory_pipeline.hh:124

gem5::GlobalMemPipeline::completeRequest
void completeRequest(GPUDynInstPtr gpuDynInst)
once a memory request is finished we remove it from the buffer.
Definition global_memory_pipeline.cc:261

gem5::GlobalMemPipeline::exec
void exec()
Definition global_memory_pipeline.cc:111

gem5::GlobalMemPipeline::gmQueueSize
int gmQueueSize
Definition global_memory_pipeline.hh:117

gem5::GlobalMemPipeline::issueRequest
void issueRequest(GPUDynInstPtr gpuDynInst)
Issues a request to the pipeline (i.e., enqueue it in the request buffer).
Definition global_memory_pipeline.cc:282

gem5::GlobalMemPipeline::outstandingReqsCheck
bool outstandingReqsCheck(GPUDynInstPtr mp) const
Definition global_memory_pipeline.cc:98

gem5::GlobalMemPipeline::init
void init()
Definition global_memory_pipeline.cc:57

gem5::GlobalMemPipeline::globalMemSize
int globalMemSize
Definition global_memory_pipeline.hh:127

gem5::GlobalMemPipeline::handleResponse
void handleResponse(GPUDynInstPtr gpuDynInst)
This method handles responses sent to this GM pipeline by the CU.
Definition global_memory_pipeline.cc:308

gem5::GlobalMemPipeline::printProgress
void printProgress()
Definition global_memory_pipeline.cc:319

gem5::GlobalMemPipeline::maxWaveRequests
int maxWaveRequests
Definition global_memory_pipeline.hh:118

gem5::GlobalMemPipeline::acqCoalescerToken
void acqCoalescerToken(GPUDynInstPtr mp)
Definition global_memory_pipeline.cc:86

gem5::GlobalMemPipeline::_name
const std::string _name
Definition global_memory_pipeline.hh:116

gem5::GlobalMemPipeline::GlobalMemPipeline
GlobalMemPipeline(const ComputeUnitParams &p, ComputeUnit &cu)
Definition global_memory_pipeline.cc:47

gem5::GlobalMemPipeline::gmOrderedRespBuffer
std::map< uint64_t, std::pair< GPUDynInstPtr, bool > > gmOrderedRespBuffer
Definition global_memory_pipeline.hh:142

gem5::GlobalMemPipeline::gmIssuedRequests
std::queue< GPUDynInstPtr > gmIssuedRequests
Definition global_memory_pipeline.hh:146

gem5::GlobalMemPipeline::inflightStores
int inflightStores
Definition global_memory_pipeline.hh:123

gem5::GlobalMemPipeline::computeUnit
ComputeUnit & computeUnit
Definition global_memory_pipeline.hh:115

gem5::GlobalMemPipeline::coalescerReady
bool coalescerReady(GPUDynInstPtr mp) const
Definition global_memory_pipeline.cc:63

gem5::GlobalMemPipeline::name
const std::string & name() const
Definition global_memory_pipeline.hh:100

gem5::GlobalMemPipeline::stats
gem5::GlobalMemPipeline::GlobalMemPipelineStats stats

gem5::Wavefront
Definition wavefront.hh:62

gem5::Wavefront::validateRequestCounters
void validateRequestCounters()
Definition wavefront.cc:829

gem5::Wavefront::outstandingReqsWrGm
int outstandingReqsWrGm
Definition wavefront.hh:179

gem5::Wavefront::wrGmReqsInPipe
int wrGmReqsInPipe
Definition wavefront.hh:193

gem5::Wavefront::rdGmReqsInPipe
int rdGmReqsInPipe
Definition wavefront.hh:191

gem5::Wavefront::outstandingReqsRdGm
int outstandingReqsRdGm
Definition wavefront.hh:183

gem5::Wavefront::outstandingReqs
int outstandingReqs
Definition wavefront.hh:177

gem5::statistics::Group
Statistics container.
Definition group.hh:93

compute_unit.hh

global_memory_pipeline.hh

gpu_dyn_inst.hh

ADD_STAT
#define ADD_STAT(n,...)
Convenience macro to add a stat to a statistics group.
Definition group.hh:75

gem5::statistics::Group::Group
Group()=delete

gem5::ArmISA::m
Bitfield< 0 > m
Definition misc_types.hh:482

gem5::ArmISA::mp
Bitfield< 11 > mp
Definition misc_types.hh:977

gem5::MipsISA::p
Bitfield< 0 > p
Definition pra_constants.hh:326

gem5::MipsISA::w
Bitfield< 0 > w
Definition pra_constants.hh:281

gem5::statistics
Definition statistics.cc:57

gem5
Copyright (c) 2024 Arm Limited All rights reserved.
Definition binary32.hh:36

gem5::GPUDynInstPtr
std::shared_ptr< GPUDynInst > GPUDynInstPtr
Definition misc.hh:49

gem5::curTick
Tick curTick()
The universal simulation clock.
Definition cur_tick.hh:46

gem5::Initiate
@ Initiate
Definition misc.hh:53

gem5::Complete
@ Complete
Definition misc.hh:57

gem5::Tick
uint64_t Tick
Tick count type.
Definition types.hh:58

shader.hh

gem5::GlobalMemPipeline::GlobalMemPipelineStats::loadVrfBankConflictCycles
statistics::Scalar loadVrfBankConflictCycles
Definition global_memory_pipeline.hh:156

gem5::GlobalMemPipeline::GlobalMemPipelineStats::GlobalMemPipelineStats
GlobalMemPipelineStats(statistics::Group *parent)
Definition global_memory_pipeline.cc:334

vector_register_file.hh

wavefront.hh