// NOTE: the includes and the function header below are reconstructed from
// context; the original lines are elided in this excerpt.
#include "base/intmath.hh"
#include "base/trace.hh"
#include "debug/GPUMem.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "mem/packet.hh"
#include "mem/request.hh"

/**
 * Helper for vector memory instructions: builds and issues one request
 * per active lane, splitting any request that crosses a cache line.
 */
template<typename T, int N>
inline void
initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,
                 bool is_atomic=false)
{
    // local variables
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    Addr vaddr = 0, split_addr = 0;
    bool misaligned_acc = false;
    RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;
    PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;
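    // Each lane whose exec_mask bit is set issues its own request below;
    // a request that straddles a cache-line boundary is split in two at
    // the boundary so that no single packet crosses a line.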
    gpuDynInst->resetEntireStatusVector();
    for (int lane = 0; lane < gpuDynInst->exec_mask.size(); ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            vaddr = gpuDynInst->addr[lane];

            /**
             * the base address of the cache line holding the last byte
             * of this lane's request (reconstructed; the original
             * computation is elided in this excerpt)
             */
            split_addr = roundDown(vaddr + req_size - 1, block_size);

            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            /**
             * if the base cache line address of the last byte is
             * greater than the address of the first byte, the access
             * spans two cache lines
             */
            misaligned_acc = split_addr > vaddr;

            if (is_atomic) {
                // atomics must be word aligned
                assert((vaddr & 0x3) == 0);

                // a given lane's atomic can't be split across cache lines
                assert(!misaligned_acc);
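                // the atomic op functor captures this lane's source
                // operand (a_data) and comparator (x_data, used by
                // cmpswap); the memory system applies it at the target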
                req = std::make_shared<Request>(vaddr, sizeof(T), 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId,
                    gpuDynInst->makeAtomicOpFunctor<T>(
                        &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],
                        &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));
            } else {
                req = std::make_shared<Request>(vaddr, req_size, 0,
                    gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
            }
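            // dispatch: a misaligned access becomes two packets whose
            // data pointers carve up this lane's slice of d_data; an
            // aligned access goes out as a single packet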
            if (misaligned_acc) {
                gpuDynInst->setStatusVector(lane, 2);
                req->splitOnVaddr(split_addr, req1, req2);
                gpuDynInst->setRequestFlags(req1);
                gpuDynInst->setRequestFlags(req2);
                pkt1 = new Packet(req1, mem_req_type);
                pkt2 = new Packet(req2, mem_req_type);
                pkt1->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                pkt2->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N +
                    req1->getSize()/sizeof(T)]);
                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "
                        "request for %#x\n", gpuDynInst->cu_id,
                        gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,
                        split_addr);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);
            } else {
                gpuDynInst->setStatusVector(lane, 1);
                gpuDynInst->setRequestFlags(req);
                pkt = new Packet(req, mem_req_type);
                pkt->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[lane * N]);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);
            }
        } else {  // inactive lane: no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
    }
}
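// A sketch of a typical call site, assuming gem5's VEGA/GCN3 conventions
// (instruction and operand types are illustrative): a vector load's
// initiateAcc() would issue
//
//     initMemReqHelper<VecElemU32, 1>(gpuDynInst, MemCmd::ReadReq);
//
// while a vector atomic passes is_atomic = true so each lane's request
// carries its AtomicOpFunctor.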
/**
 * Helper for scatter/gather-style vector memory instructions, where each
 * dword of a lane is issued as its own request.
 * NOTE: the function header and several declarations below are
 * reconstructed from context; the original lines are elided in this
 * excerpt.
 */
template<typename T, int N>
inline void
initMemReqScatterGatherHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
    int req_size = sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();

    gpuDynInst->resetEntireStatusVector();
    for (int lane = 0; lane < gpuDynInst->exec_mask.size(); ++lane) {
        if (gpuDynInst->exec_mask[lane]) {
            Addr vaddr[N];
            RequestPtr req[N];
            PacketPtr pkt[N];
            // per-dword address stride; the original initialization is
            // elided in this excerpt (contiguous dwords assumed here)
            int stride = sizeof(T);

            for (int dword = 0; dword < N; ++dword) {
                vaddr[dword] = gpuDynInst->addr[lane] + dword * stride;
                // elided: per-dword alignment checks against block_size
                // and address-space checks; scratch accesses are rejected
                // here with "scratch access not yet implemented"
            }
            gpuDynInst->setStatusVector(lane, N);
            for (int dword = 0; dword < N; ++dword) {
                req[dword] = std::make_shared<Request>(vaddr[dword], req_size,
                    0, gpuDynInst->computeUnit()->requestorId(), 0,
                    gpuDynInst->wfDynId);
                gpuDynInst->setRequestFlags(req[dword]);
                pkt[dword] = new Packet(req[dword], mem_req_type);
                // index of this dword's element within d_data; the
                // original computation is elided in this excerpt
                // (lane-major layout assumed here)
                int data_elem = lane * N + dword;
                pkt[dword]->dataStatic(&(reinterpret_cast<T*>(
                    gpuDynInst->d_data))[data_elem]);
                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane,
                                                       pkt[dword]);
            }
        } else {  // inactive lane: no pending requests
            gpuDynInst->setStatusVector(lane, 0);
        }
    }
}
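// Unlike initMemReqHelper, this excerpt's scatter/gather path never calls
// splitOnVaddr: each per-dword request is only sizeof(T) bytes, so it
// cannot span a cache line when naturally aligned.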
/**
 * Helper for scalar memory instructions.
 * NOTE: the function header and several lines below are reconstructed
 * from context; the originals are elided in this excerpt.
 */
template<typename T, int N>
inline void
initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)
{
    int req_size = N * sizeof(T);
    int block_size = gpuDynInst->computeUnit()->cacheLineSize();
    Addr vaddr = gpuDynInst->scalarAddr;

    /**
     * the base address of the cache line where the last byte of the
     * request will be stored
     */
    Addr split_addr = roundDown(vaddr + req_size - 1, block_size);

    assert(split_addr <= vaddr || split_addr - vaddr < block_size);

    /**
     * if the base cache line address of the last byte is greater than
     * the address of the first byte, the access spans two cache lines;
     * instructions that carry no address (hasNoAddr) are never split
     */
    bool misaligned_acc = split_addr > vaddr &&
        !gpuDynInst->staticInstruction()->hasNoAddr();

    Request::Flags flags;
    if (gpuDynInst->staticInstruction()->hasNoAddr()) {
        // elided: flag setup marking the request as carrying no address
        // (e.g. s_memtime/s_memrealtime do not access memory)
    }

    RequestPtr req = std::make_shared<Request>(
        vaddr, req_size, std::move(flags),
        gpuDynInst->computeUnit()->requestorId(), 0,
        gpuDynInst->wfDynId);
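    // a scalar access is sent whole or split once at the cache-line
    // boundary; numScalarReqs records how many responses the instruction
    // still expects before it completes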
    if (misaligned_acc) {
        RequestPtr req1, req2;
        req->splitOnVaddr(split_addr, req1, req2);
        gpuDynInst->numScalarReqs = 2;
        gpuDynInst->setRequestFlags(req1);
        gpuDynInst->setRequestFlags(req2);
        PacketPtr pkt1 = new Packet(req1, mem_req_type);
        PacketPtr pkt2 = new Packet(req2, mem_req_type);
        pkt1->dataStatic(gpuDynInst->scalar_data);
        pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());
        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"
                " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,
                gpuDynInst->wfSlotId, split_addr);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);
    } else {
        gpuDynInst->numScalarReqs = 1;
        gpuDynInst->setRequestFlags(req);
        PacketPtr pkt = new Packet(req, mem_req_type);
        pkt->dataStatic(gpuDynInst->scalar_data);
        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);
    }
}
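// A sketch of a typical scalar call site, assuming gem5's VEGA/GCN3
// conventions (types and width are illustrative): an s_load_dwordx4's
// initiateAcc() would issue
//
//     initMemReqScalarHelper<ScalarRegU32, 4>(gpuDynInst, MemCmd::ReadReq);
//
// with scalar_data sized to hold N * sizeof(T) bytes.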