// release/v20-1-0-0/dispatcher_8cc_source.html

/*

 * Copyright (c) 2011-2015,2018 Advanced Micro Devices, Inc.

 * All rights reserved.

 *

 * For use for simulation and test purposes only

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions are met:

 *

 * 1. Redistributions of source code must retain the above copyright notice,

 * this list of conditions and the following disclaimer.

 *

 * 2. Redistributions in binary form must reproduce the above copyright notice,

 * this list of conditions and the following disclaimer in the documentation

 * and/or other materials provided with the distribution.

 *

 * 3. Neither the name of the copyright holder nor the names of its

 * contributors may be used to endorse or promote products derived from this

 * software without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE

 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

 * POSSIBILITY OF SUCH DAMAGE.

 */


#include "gpu-compute/dispatcher.hh"


#include "debug/GPUDisp.hh"

#include "debug/GPUKernelInfo.hh"

#include "debug/GPUWgLatency.hh"

#include "gpu-compute/gpu_command_processor.hh"

#include "gpu-compute/hsa_queue_entry.hh"

#include "gpu-compute/shader.hh"

#include "gpu-compute/wavefront.hh"

#include "sim/syscall_emul_buf.hh"

#include "sim/system.hh"


/**
 * Construct the dispatcher with no shader or command processor attached
 * (see setShader() and setCommandProcessor()) and with dispatch inactive.
 * The tick event is bound to exec() and its first occurrence is scheduled
 * for tick 0.
 */
GPUDispatcher::GPUDispatcher(const Params *p)
    : SimObject(p),
      shader(nullptr),
      gpuCmdProc(nullptr),
      tickEvent([this]() { exec(); }, "GPU Dispatcher tick", false,
                Event::CPU_Tick_Pri),
      dispatchActive(false)
{
    // Run one pass of exec() at the very start of simulation.
    schedule(&tickEvent, 0);
}


/** Destructor; performs no explicit cleanup. */
GPUDispatcher::~GPUDispatcher() = default;


/**
 * Assign names and descriptions to the dispatcher's statistics:
 * the kernel launch counter and the waiting-for-dispatch cycle counter.
 */
void
GPUDispatcher::regStats()
{
    // Incremented once per call to dispatch().
    numKernelLaunched
        .name(name() + ".num_kernel_launched")
        .desc("number of kernel launched");

    // Incremented once per exec() pass that starts with queued kernels.
    cyclesWaitingForDispatch
        .name(name() + ".cycles_wait_dispatch")
        .desc("number of cycles with outstanding wavefronts "
              "that are waiting to be dispatched");
}


/**
 * Look up the HSA queue entry registered under a dispatch ID.
 *
 * @param disp_id dispatch ID used as the key into hsaQueueEntries
 * @return the entry stored for disp_id. An unknown ID trips the assertion;
 *         when assertions are compiled out, nullptr is returned instead.
 */
HSAQueueEntry*
GPUDispatcher::hsaTask(int disp_id)
{
    // One find() replaces the previous find() + operator[] pair. Unlike
    // operator[], find() never inserts a null entry for an unknown ID.
    auto it = hsaQueueEntries.find(disp_id);
    assert(it != hsaQueueEntries.end());

    return it != hsaQueueEntries.end() ? it->second : nullptr;
}


/**
 * Attach the command processor that notifyWgCompl() uses to finish
 * packets and write completion signals.
 *
 * @param gpu_cmd_proc command processor to store; not owned or checked here
 */
void
GPUDispatcher::setCommandProcessor(GPUCommandProcessor *gpu_cmd_proc)
{
    this->gpuCmdProc = gpu_cmd_proc;
}


/**
 * Attach the shader that supplies the clock period for scheduling and
 * that exec() asks to dispatch workgroups.
 *
 * @param new_shader shader to store; not owned or checked here
 */
void
GPUDispatcher::setShader(Shader *new_shader)
{
    this->shader = new_shader;
}


/**
 * Checkpoint the tick at which the dispatcher's tick event is pending.
 * A value of 0 is written when the event is not scheduled.
 *
 * @param cp checkpoint output to write the "event_tick" scalar to
 */
void
GPUDispatcher::serialize(CheckpointOut &cp) const
{
    // The variable name doubles as the checkpoint key, so it must stay
    // "event_tick" to match unserialize().
    Tick event_tick = tickEvent.scheduled() ? tickEvent.when() : 0;

    SERIALIZE_SCALAR(event_tick);
}


/**
 * Restore the tick event from a checkpoint. Any currently pending tick
 * event is cancelled first; the event is then re-scheduled at the saved
 * tick unless the saved value is 0.
 *
 * @param cp checkpoint input to read the "event_tick" scalar from
 */
void
GPUDispatcher::unserialize(CheckpointIn &cp)
{
    // Drop whatever was scheduled before the restore (e.g. by the
    // constructor) so the checkpointed state fully determines the event.
    if (tickEvent.scheduled()) {
        deschedule(&tickEvent);
    }

    // The variable name doubles as the checkpoint key written by
    // serialize(), so it must stay "event_tick".
    Tick event_tick = 0;
    UNSERIALIZE_SCALAR(event_tick);

    // 0 is what serialize() writes when no event was pending.
    if (event_tick != 0) {
        schedule(&tickEvent, event_tick);
    }
}


/**
 * Accept a new kernel for dispatch: count it, register it under its
 * dispatch ID, queue the ID for exec(), and make sure a dispatcher tick
 * is pending.
 *
 * @param task queue entry describing the kernel; stored by pointer
 */
void
GPUDispatcher::dispatch(HSAQueueEntry *task)
{
    ++numKernelLaunched;

    DPRINTF(GPUDisp, "launching kernel: %s, dispatch ID: %d\n",
            task->kernelName(), task->dispatchId());

    // Make the task reachable by ID and line it up for the next exec().
    hsaQueueEntries.emplace(task->dispatchId(), task);
    execIds.push(task->dispatchId());
    dispatchActive = true;

    // A tick is already pending; it will pick up the new kernel.
    if (tickEvent.scheduled()) {
        return;
    }

    schedule(&tickEvent, curTick() + shader->clockPeriod());
}


/**
 * One dispatcher pass, run from the tick event.
 *
 * Walks the queue of pending kernel IDs. For each kernel it first makes
 * sure the launch-time invalidate has completed, then keeps asking the
 * shader to dispatch workgroups until the kernel is fully dispatched or
 * the shader refuses. A kernel that cannot make progress is pushed back
 * onto the queue; the pass ends once every ID left in the queue is one
 * that was re-queued during this pass. Finally the done-ID queue is
 * drained.
 */
void
GPUDispatcher::exec()
{
    // Number of kernels pushed back onto execIds during this pass.
    int num_requeued = 0;

    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    if (!execIds.empty()) {
        ++cyclesWaitingForDispatch;
    }

    // Stop once only re-queued kernels remain in the queue.
    while (execIds.size() > num_requeued) {
        int kern_id = execIds.front();
        auto entry = hsaQueueEntries[kern_id];

        // An acquire is needed before the kernel may start dispatching.
        if (shader->impl_kern_launch_acq) {
            // try to invalidate cache
            shader->prepareInvalidate(entry);
        } else {
            // kern launch acquire is not set, skip invalidate
            entry->markInvDone();
        }

        if (entry->isInvDone()) {
            // kernel invalidate is done, start workgroup dispatch
            bool any_wg_launched = false;

            while (!entry->dispComplete()) {
                // update the thread context
                shader->updateContext(entry->contextId());

                // attempt to dispatch workgroup
                DPRINTF(GPUWgLatency,
                    "Attempt Kernel Launch cycle:%d kernel:%d\n",
                    curTick(), kern_id);

                if (!shader->dispatchWorkgroups(entry)) {
                    DPRINTF(GPUDisp, "kernel %d failed to launch\n",
                        kern_id);
                    // Retry this kernel on a later pass.
                    execIds.push(kern_id);
                    ++num_requeued;
                    break;
                }

                // Report the launch only for the first successful dispatch.
                if (!any_wg_launched) {
                    any_wg_launched = true;
                    DPRINTF(GPUKernelInfo, "Launched kernel %d\n", kern_id);
                }
            }
        } else {
            // Invalidate still in flight: retry this kernel later.
            execIds.push(kern_id);
            ++num_requeued;

            DPRINTF(GPUDisp, "kernel %d failed to launch, due to [%d] pending"
                " invalidate requests\n", kern_id, entry->outstandingInvs());
        }

        // Drop the front entry and try the next kernel ID. If the kernel
        // was re-queued above, its ID now sits at the back of the queue.
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    while (!doneIds.empty()) {
        DPRINTF(GPUDisp, "Kernel %d completed\n", doneIds.front());
        doneIds.pop();
    }
}


/**
 * Tell whether the kernel a wavefront belongs to has exactly one
 * workgroup left to complete.
 *
 * @param wf wavefront whose kernId selects the kernel
 * @return true when numWgCompleted() + 1 equals numWgTotal() for the
 *         kernel's queue entry
 */
bool
GPUDispatcher::isReachingKernelEnd(Wavefront *wf)
{
    int kern_id = wf->kernId;

    // One find() replaces the previous find() + operator[] pair and
    // cannot insert a null entry for an unknown kernel ID.
    auto it = hsaQueueEntries.find(kern_id);
    assert(it != hsaQueueEntries.end());

    auto task = it->second;
    assert(task->dispatchId() == kern_id);

    return (task->numWgCompleted() + 1 == task->numWgTotal());
}


/**
 * Adjust a kernel's outstanding-invalidate count by one in either
 * direction, and schedule a dispatcher tick once the kernel's invalidate
 * is done.
 *
 * @param kern_id kernel ID used as the key into hsaQueueEntries;
 *                must be a registered kernel
 * @param val     +1 or -1
 */
void
GPUDispatcher::updateInvCounter(int kern_id, int val) {
    assert(val == -1 || val == 1);

    // Look the kernel up with find() so an unknown ID trips the assertion
    // instead of operator[] inserting a null entry that is then
    // dereferenced below.
    auto it = hsaQueueEntries.find(kern_id);
    assert(it != hsaQueueEntries.end());

    auto task = it->second;
    task->updateOutstandingInvs(val);

    // kernel invalidate is done, schedule dispatch work
    if (task->isInvDone() && !tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}


/**
 * Adjust a kernel's outstanding-writeback count by one in either
 * direction.
 *
 * @param kern_id kernel ID used as the key into hsaQueueEntries;
 *                must be a registered kernel
 * @param val     +1 or -1
 * @return true when the kernel has no outstanding writebacks after the
 *         update, false while writebacks are still ongoing
 */
bool
GPUDispatcher::updateWbCounter(int kern_id, int val) {
    assert(val == -1 || val == 1);

    // Look the kernel up with find() so an unknown ID trips the assertion
    // instead of operator[] inserting a null entry that is then
    // dereferenced below.
    auto it = hsaQueueEntries.find(kern_id);
    assert(it != hsaQueueEntries.end());

    auto task = it->second;
    task->updateOutstandingWbs(val);

    // true: WB is done, false: WB is still ongoing
    return (task->outstandingWbs() == 0);
}


/**
 * Report a kernel's current outstanding-writeback count.
 *
 * @param kernId kernel ID used as the key into hsaQueueEntries;
 *               must be a registered kernel
 * @return the value of outstandingWbs() for the kernel's queue entry
 */
int
GPUDispatcher::getOutstandingWbs(int kernId) {
    // A query must not modify the map: find() avoids the null entry that
    // operator[] would insert (and then dereference) for an unknown ID.
    auto it = hsaQueueEntries.find(kernId);
    assert(it != hsaQueueEntries.end());

    return it->second->outstandingWbs();
}


/**
 * Record that a workgroup of a kernel has completed.
 *
 * The kernel's queue entry is told about the completion. If that was the
 * kernel's last workgroup, the command processor's HSA packet processor
 * is asked to finish the packet and, when the task has a completion
 * signal, the signal value is read, decremented by one and written back.
 * In all cases a dispatcher tick is scheduled if none is pending.
 *
 * @param wf wavefront whose kernId selects the kernel and whose wgId and
 *           computeUnit are reported in the debug output
 */
void
GPUDispatcher::notifyWgCompl(Wavefront *wf)
{
    int kern_id = wf->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n", wf->wgId);

    // Look the kernel up with find() so an unknown ID trips the assertion
    // instead of operator[] inserting a null entry that is then
    // dereferenced below.
    auto it = hsaQueueEntries.find(kern_id);
    assert(it != hsaQueueEntries.end());

    auto task = it->second;
    assert(task->dispatchId() == kern_id);
    task->notifyWgCompleted();

    DPRINTF(GPUWgLatency, "WG Complete cycle:%d wg:%d kernel:%d cu:%d\n",
        curTick(), wf->wgId, kern_id, wf->computeUnit->cu_id);

    if (task->numWgCompleted() == task->numWgTotal()) {
        // Notify the HSA PP that this kernel is complete
        gpuCmdProc->hsaPacketProc()
            .finishPkt(task->dispPktPtr(), task->queueId());

        if (task->completionSignal()) {
            // The signal value is aligned 8 bytes from
            // the actual handle in the runtime
            Addr signal_addr = task->completionSignal() + sizeof(Addr);
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! Triggering "
                    "completion signal: %x!\n", signal_addr);

            // Read the current signal value through the virtual proxy of
            // the system's first thread context.
            auto *tc = gpuCmdProc->system()->threads[0];
            auto &virt_proxy = tc->getVirtProxy();
            TypedBufferArg<Addr> prev_signal(signal_addr);
            prev_signal.copyIn(virt_proxy);

            // The decremented value lives on the heap because its address
            // is handed to dmaWriteVirt().
            // NOTE(review): nothing in this function frees new_signal and
            // nullptr is passed as dmaWriteVirt()'s third argument, so
            // this looks like a leak - confirm who owns the buffer.
            Addr *new_signal = new Addr;
            *new_signal = static_cast<Addr>(*prev_signal) - 1;

            gpuCmdProc->dmaWriteVirt(signal_addr, sizeof(Addr), nullptr,
                new_signal, 0);
        } else {
            DPRINTF(GPUDisp, "HSA AQL Kernel Complete! No completion "
                "signal\n");
        }

        DPRINTF(GPUWgLatency, "Kernel Complete ticks:%d kernel:%d\n",
                curTick(), kern_id);
        DPRINTF(GPUKernelInfo, "Completed kernel %d\n", kern_id);
    }

    // Make sure exec() runs again after this completion.
    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->clockPeriod());
    }
}


/**
 * Ensure a dispatcher tick is pending: if the tick event is not already
 * scheduled, schedule it one shader clock period from the current tick.
 */
void
GPUDispatcher::scheduleDispatch()
{
    // Nothing to do when a tick is already on its way.
    if (tickEvent.scheduled()) {
        return;
    }

    schedule(&tickEvent, curTick() + shader->clockPeriod());
}


/**
 * Build a GPUDispatcher from this parameter object.
 *
 * @return a newly heap-allocated GPUDispatcher; the caller receives the
 *         raw pointer
 */
GPUDispatcher *
GPUDispatcherParams::create()
{
    GPUDispatcher *dispatcher = new GPUDispatcher(this);
    return dispatcher;
}