release/v20-1-0-0/gpu__mem__helpers_8hh_source.html

/*

 * Copyright (c) 2018 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * For use for simulation and test purposes only

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 *

 * Authors: Matt Sinclair

 */


#ifndef __ARCH_GCN3_GPU_MEM_HELPERS_HH__

#define __ARCH_GCN3_GPU_MEM_HELPERS_HH__


#include "arch/gcn3/insts/gpu_static_inst.hh"

#include "arch/gcn3/insts/op_encodings.hh"

#include "debug/GPUMem.hh"

#include "gpu-compute/gpu_dyn_inst.hh"


template<typename T, int N>

inline void

initMemReqHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type,

                 bool is_atomic=false)

{

    // local variables

    int req_size = N * sizeof(T);

    int block_size = gpuDynInst->computeUnit()->cacheLineSize();

    Addr vaddr = 0, split_addr = 0;

    bool misaligned_acc = false;

    RequestPtr req = nullptr, req1 = nullptr, req2 = nullptr;

    PacketPtr pkt = nullptr, pkt1 = nullptr, pkt2 = nullptr;


    gpuDynInst->resetEntireStatusVector();

    for (int lane = 0; lane < Gcn3ISA::NumVecElemPerVecReg; ++lane) {

        if (gpuDynInst->exec_mask[lane]) {

            vaddr = gpuDynInst->addr[lane];


            split_addr = roundDown(vaddr + req_size - 1, block_size);


            assert(split_addr <= vaddr || split_addr - vaddr < block_size);

            misaligned_acc = split_addr > vaddr;


            if (is_atomic) {

                // make sure request is word aligned

                assert((vaddr & 0x3) == 0);


                // a given lane's atomic can't cross cache lines

                assert(!misaligned_acc);


                req = std::make_shared<Request>(vaddr, sizeof(T), 0,

                    gpuDynInst->computeUnit()->requestorId(), 0,

                    gpuDynInst->wfDynId,

                    gpuDynInst->makeAtomicOpFunctor<T>(

                        &(reinterpret_cast<T*>(gpuDynInst->a_data))[lane],

                        &(reinterpret_cast<T*>(gpuDynInst->x_data))[lane]));

            } else {

                req = std::make_shared<Request>(vaddr, req_size, 0,

                                  gpuDynInst->computeUnit()->requestorId(), 0,

                                  gpuDynInst->wfDynId);

            }


            if (misaligned_acc) {

                gpuDynInst->setStatusVector(lane, 2);

                req->splitOnVaddr(split_addr, req1, req2);

                gpuDynInst->setRequestFlags(req1);

                gpuDynInst->setRequestFlags(req2);

                pkt1 = new Packet(req1, mem_req_type);

                pkt2 = new Packet(req2, mem_req_type);

                pkt1->dataStatic(&(reinterpret_cast<T*>(

                    gpuDynInst->d_data))[lane * N]);

                pkt2->dataStatic(&(reinterpret_cast<T*>(

                    gpuDynInst->d_data))[lane * N + req1->getSize()]);

                DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index: %d unaligned memory "

                        "request for %#x\n", gpuDynInst->cu_id,

                        gpuDynInst->simdId, gpuDynInst->wfSlotId, lane,

                        split_addr);

                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt1);

                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt2);

            } else {

                gpuDynInst->setStatusVector(lane, 1);

                gpuDynInst->setRequestFlags(req);

                pkt = new Packet(req, mem_req_type);

                pkt->dataStatic(&(reinterpret_cast<T*>(

                    gpuDynInst->d_data))[lane * N]);

                gpuDynInst->computeUnit()->sendRequest(gpuDynInst, lane, pkt);

            }

        } else { // if lane is not active, then no pending requests

            gpuDynInst->setStatusVector(lane, 0);

        }

    }

}


template<typename T, int N>

inline void

initMemReqScalarHelper(GPUDynInstPtr gpuDynInst, MemCmd mem_req_type)

{

    int req_size = N * sizeof(T);

    int block_size = gpuDynInst->computeUnit()->cacheLineSize();

    Addr vaddr = gpuDynInst->scalarAddr;


    Addr split_addr = roundDown(vaddr + req_size - 1, block_size);


    assert(split_addr <= vaddr || split_addr - vaddr < block_size);

    bool misaligned_acc = split_addr > vaddr;


    RequestPtr req = std::make_shared<Request>(vaddr, req_size, 0,

                                 gpuDynInst->computeUnit()->requestorId(), 0,

                                 gpuDynInst->wfDynId);


    if (misaligned_acc) {

        RequestPtr req1, req2;

        req->splitOnVaddr(split_addr, req1, req2);

        gpuDynInst->numScalarReqs = 2;

        gpuDynInst->setRequestFlags(req1);

        gpuDynInst->setRequestFlags(req2);

        PacketPtr pkt1 = new Packet(req1, mem_req_type);

        PacketPtr pkt2 = new Packet(req2, mem_req_type);

        pkt1->dataStatic(gpuDynInst->scalar_data);

        pkt2->dataStatic(gpuDynInst->scalar_data + req1->getSize());

        DPRINTF(GPUMem, "CU%d: WF[%d][%d]: unaligned scalar memory request for"

                " %#x\n", gpuDynInst->cu_id, gpuDynInst->simdId,

                gpuDynInst->wfSlotId, split_addr);

        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt1);

        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt2);

    } else {

        gpuDynInst->numScalarReqs = 1;

        gpuDynInst->setRequestFlags(req);

        PacketPtr pkt = new Packet(req, mem_req_type);

        pkt->dataStatic(gpuDynInst->scalar_data);

        gpuDynInst->computeUnit()->sendScalarRequest(gpuDynInst, pkt);

    }

}


#endif // __ARCH_GCN3_GPU_MEM_HELPERS_HH__