develop/hsa__queue__entry_8hh_source.html

/*

 * Copyright (c) 2017-2018 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 */


#ifndef __GPU_COMPUTE_HSA_QUEUE_ENTRY__

#define __GPU_COMPUTE_HSA_QUEUE_ENTRY__


#include <bitset>

#include <cstdint>

#include <cstring>

#include <iostream>

#include <vector>


#include "base/intmath.hh"

#include "base/types.hh"

#include "dev/hsa/hsa_packet.hh"

#include "dev/hsa/hsa_queue.hh"

#include "enums/GfxVersion.hh"

#include "gpu-compute/kernel_code.hh"


namespace gem5

{


class HSAQueueEntry

{

  public:

    HSAQueueEntry(std::string kernel_name, uint32_t queue_id,

                  int dispatch_id, void *disp_pkt, AMDKernelCode *akc,

                  Addr host_pkt_addr, Addr code_addr, GfxVersion gfx_version)

        : kernName(kernel_name),

          _wgSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_x,

                  (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_y,

                  (int)((_hsa_dispatch_packet_t*)disp_pkt)->workgroup_size_z}},

          _gridSize{{(int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_x,

                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_y,

                    (int)((_hsa_dispatch_packet_t*)disp_pkt)->grid_size_z}},

          numVgprs(akc->workitem_vgpr_count),

          numSgprs(akc->wavefront_sgpr_count),

          _queueId(queue_id), _dispatchId(dispatch_id), dispPkt(disp_pkt),

          _hostDispPktAddr(host_pkt_addr),

          _completionSignal(((_hsa_dispatch_packet_t*)disp_pkt)

                            ->completion_signal),

          codeAddress(code_addr),

          kernargAddress(((_hsa_dispatch_packet_t*)disp_pkt)->kernarg_address),

          _outstandingInvs(-1), _outstandingWbs(0),

          _ldsSize((int)((_hsa_dispatch_packet_t*)disp_pkt)->

                   group_segment_size),

          _privMemPerItem((int)((_hsa_dispatch_packet_t*)disp_pkt)->

                         private_segment_size),

          _contextId(0), _wgId{{ 0, 0, 0 }},

          _numWgTotal(1), numWgArrivedAtBarrier(0), _numWgCompleted(0),

          _globalWgId(0), dispatchComplete(false)


    {

        // Precompiled BLIT kernels actually violate the spec a bit

        // and don't set many of the required akc fields.  For these kernels,

        // we need to rip register usage from the resource registers.

        //

        // We can't get an exact number of registers from the resource

        // registers because they round, but we can get an upper bound on it.

        // We determine the number of registers by solving for "vgprs_used"

        // in the LLVM docs: https://www.llvm.org/docs/AMDGPUUsage.html

        //     #code-object-v3-kernel-descriptor

        // Currently, the only supported gfx version in gem5 that computes

        // this differently is gfx90a.

        if (!numVgprs) {

            if (gfx_version == GfxVersion::gfx90a) {

                numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 8;

            } else {

                numVgprs = (akc->granulated_workitem_vgpr_count + 1) * 4;

            }

        }


        if (!numSgprs || numSgprs ==

            std::numeric_limits<decltype(akc->wavefront_sgpr_count)>::max()) {

            // Supported major generation numbers: 0 (BLIT kernels), 8, and 9

            uint16_t version = akc->amd_machine_version_major;

            assert((version == 0) || (version == 8) || (version == 9));

            // SGPR allocation granularies:

            // - GFX8: 8

            // - GFX9: 16

            // Source: https://llvm.org/docs/AMDGPUUsage.html

            if ((version == 0) || (version == 8)) {

                // We assume that BLIT kernels use the same granularity as GFX8

                numSgprs = (akc->granulated_wavefront_sgpr_count + 1) * 8;

            } else if (version == 9) {

                numSgprs = ((akc->granulated_wavefront_sgpr_count + 1) * 16)/2;

            }

        }


        initialVgprState.reset();

        initialSgprState.reset();


        for (int i = 0; i < MAX_DIM; ++i) {

            _numWg[i] = divCeil(_gridSize[i], _wgSize[i]);

            _numWgTotal *= _numWg[i];

        }


        parseKernelCode(akc);

    }


    const std::string&

    kernelName() const

    {

        return kernName;

    }


    int

    wgSize(int dim) const

    {

        assert(dim < MAX_DIM);

        return _wgSize[dim];

    }


    int

    gridSize(int dim) const

    {

        assert(dim < MAX_DIM);

        return _gridSize[dim];

    }


    int

    numVectorRegs() const

    {

        return numVgprs;

    }


    int

    numScalarRegs() const

    {

        return numSgprs;

    }


    uint32_t

    queueId() const

    {

        return _queueId;

    }


    int

    dispatchId() const

    {

        return _dispatchId;

    }


    void*

    dispPktPtr()

    {

        return dispPkt;

    }


    Addr

    hostDispPktAddr() const

    {

        return _hostDispPktAddr;

    }


    Addr

    completionSignal() const

    {

        return _completionSignal;

    }


    Addr

    codeAddr() const

    {

        return codeAddress;

    }


    Addr

    kernargAddr() const

    {

        return kernargAddress;

    }


    int

    ldsSize() const

    {

        return _ldsSize;

    }


    int privMemPerItem() const { return _privMemPerItem; }


    int

    contextId() const

    {

        return _contextId;

    }


    bool

    dispComplete() const

    {

        return dispatchComplete;

    }


    int

    wgId(int dim) const

    {

        assert(dim < MAX_DIM);

        return _wgId[dim];

    }


    void

    wgId(int dim, int val)

    {

        assert(dim < MAX_DIM);

        _wgId[dim] = val;

    }


    int

    globalWgId() const

    {

        return _globalWgId;

    }


    void

    globalWgId(int val)

    {

        _globalWgId = val;

    }


    int

    numWg(int dim) const

    {

        assert(dim < MAX_DIM);

        return _numWg[dim];

    }


    void

    notifyWgCompleted()

    {

        ++_numWgCompleted;

    }


    int

    numWgCompleted() const

    {

        return _numWgCompleted;

    }


    int

    numWgTotal() const

    {

        return _numWgTotal;

    }


    void

    markWgDispatch()

    {

        ++_wgId[0];

        ++_globalWgId;


        if (wgId(0) * wgSize(0) >= gridSize(0)) {

            _wgId[0] = 0;

            ++_wgId[1];


            if (wgId(1) * wgSize(1) >= gridSize(1)) {

                _wgId[1] = 0;

                ++_wgId[2];


                if (wgId(2) * wgSize(2) >= gridSize(2)) {

                    dispatchComplete = true;

                }

            }

        }

    }


    int

    numWgAtBarrier() const

    {

        return numWgArrivedAtBarrier;

    }


    bool vgprBitEnabled(int bit) const

    {

        return initialVgprState.test(bit);

    }


    bool sgprBitEnabled(int bit) const

    {

        return initialSgprState.test(bit);

    }


    Addr hostAMDQueueAddr;


    _amd_queue_t amdQueue;


    // the maximum number of dimensions for a grid or workgroup

    const static int MAX_DIM = 3;


    /* getter */

    int

    outstandingInvs() {

        return _outstandingInvs;

    }


    bool

    isInvStarted()

    {

        return (_outstandingInvs != -1);

    }


    void

    updateOutstandingInvs(int val)

    {

        _outstandingInvs += val;

        assert(_outstandingInvs >= 0);

    }


    void

    markInvDone()

    {

        _outstandingInvs = 0;

    }


    bool

    isInvDone() const

    {

        assert(_outstandingInvs >= 0);

        return (_outstandingInvs == 0);

    }


    int

    outstandingWbs() const

    {

        return _outstandingWbs;

    }


    void

    updateOutstandingWbs(int val)

    {

        _outstandingWbs += val;

        assert(_outstandingWbs >= 0);

    }


  private:

    void

    parseKernelCode(AMDKernelCode *akc)

    {

        initialSgprState.set(PrivateSegBuf,

            akc->enable_sgpr_private_segment_buffer);

        initialSgprState.set(DispatchPtr,

            akc->enable_sgpr_dispatch_ptr);

        initialSgprState.set(QueuePtr,

            akc->enable_sgpr_queue_ptr);

        initialSgprState.set(KernargSegPtr,

            akc->enable_sgpr_kernarg_segment_ptr);

        initialSgprState.set(DispatchId,

            akc->enable_sgpr_dispatch_id);

        initialSgprState.set(FlatScratchInit,

            akc->enable_sgpr_flat_scratch_init);

        initialSgprState.set(PrivateSegSize,

            akc->enable_sgpr_private_segment_size);

        initialSgprState.set(GridWorkgroupCountX,

            akc->enable_sgpr_grid_workgroup_count_x);

        initialSgprState.set(GridWorkgroupCountY,

            akc->enable_sgpr_grid_workgroup_count_y);

        initialSgprState.set(GridWorkgroupCountZ,

            akc->enable_sgpr_grid_workgroup_count_z);

        initialSgprState.set(WorkgroupIdX,

            akc->enable_sgpr_workgroup_id_x);

        initialSgprState.set(WorkgroupIdY,

            akc->enable_sgpr_workgroup_id_y);

        initialSgprState.set(WorkgroupIdZ,

            akc->enable_sgpr_workgroup_id_z);

        initialSgprState.set(WorkgroupInfo,

            akc->enable_sgpr_workgroup_info);

        initialSgprState.set(PrivSegWaveByteOffset,

            akc->enable_sgpr_private_segment_wave_byte_offset);


        initialVgprState.set(WorkitemIdX, true);

        initialVgprState.set(WorkitemIdY, akc->enable_vgpr_workitem_id > 0);

        initialVgprState.set(WorkitemIdZ, akc->enable_vgpr_workitem_id > 1);

    }


    // name of the kernel associated with the AQL entry

    std::string kernName;

    // workgroup Size (3 dimensions)

    std::array<int, MAX_DIM> _wgSize;

    // grid Size (3 dimensions)

    std::array<int, MAX_DIM> _gridSize;

    // total number of VGPRs per work-item

    int numVgprs;

    // total number of SGPRs per wavefront

    int numSgprs;

    // id of AQL queue in which this entry is placed

    uint32_t _queueId;

    int _dispatchId;

    // raw AQL packet pointer

    void *dispPkt;

    // host-side addr of the dispatch packet

    Addr _hostDispPktAddr;

    // pointer to bool

    Addr _completionSignal;

    // base address of the raw machine code

    Addr codeAddress;

    // base address of the kernel args

    Addr kernargAddress;

    int _outstandingInvs;

    int _outstandingWbs;

    int _ldsSize;

    int _privMemPerItem;

    int _contextId;

    std::array<int, MAX_DIM> _wgId;

    std::array<int, MAX_DIM> _numWg;

    int _numWgTotal;

    int numWgArrivedAtBarrier;

    // The number of completed work groups

    int _numWgCompleted;

    int _globalWgId;

    bool dispatchComplete;


    std::bitset<NumVectorInitFields> initialVgprState;

    std::bitset<NumScalarInitFields> initialSgprState;

};


} // namespace gem5


#endif // __GPU_COMPUTE_HSA_QUEUE_ENTRY__