gem5/src/gpu-compute/dispatcher.cc

/*
 * Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Brad Beckmann, Marc Orr
 */


#include "gpu-compute/dispatcher.hh"

#include "cpu/base.hh"
#include "debug/GPUDisp.hh"
#include "gpu-compute/cl_driver.hh"
#include "gpu-compute/cl_event.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/wavefront.hh"
#include "mem/packet_access.hh"

GpuDispatcher *GpuDispatcher::instance = nullptr;

GpuDispatcher::GpuDispatcher(const Params *p)
    : DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),
      pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),
      dispatchCount(0), dispatchActive(false), cpu(p->cpu),
      shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)
{
    shader->handshake(this);
    driver->handshake(this);

    ndRange.wg_disp_rem = false;
    ndRange.globalWgId = 0;

    schedule(&tickEvent, 0);

    // translation port for the dispatcher
    tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);

    num_kernelLaunched
    .name(name() + ".num_kernel_launched")
    .desc("number of kernel launched")
    ;
}

GpuDispatcher *GpuDispatcherParams::create()
{
    GpuDispatcher *dispatcher = new GpuDispatcher(this);
    GpuDispatcher::setInstance(dispatcher);

    return GpuDispatcher::getInstance();
}

void
GpuDispatcher::serialize(CheckpointOut &cp) const
{
    Tick event_tick = 0;

    if (ndRange.wg_disp_rem)
        fatal("Checkpointing not supported during active workgroup execution");

    if (tickEvent.scheduled())
        event_tick = tickEvent.when();

    SERIALIZE_SCALAR(event_tick);

}

void
GpuDispatcher::unserialize(CheckpointIn &cp)
{
    Tick event_tick;

    if (tickEvent.scheduled())
        deschedule(&tickEvent);

    UNSERIALIZE_SCALAR(event_tick);

    if (event_tick)
        schedule(&tickEvent, event_tick);
}

AddrRangeList
GpuDispatcher::getAddrRanges() const
{
    AddrRangeList ranges;

    DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",
            pioAddr, pioSize);

    ranges.push_back(RangeSize(pioAddr, pioSize));

    return ranges;
}

Tick
GpuDispatcher::read(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;
    pkt->allocate();

    DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());

    if (offset < 8) {
        assert(!offset);
        assert(pkt->getSize() == 8);

        uint64_t retval = dispatchActive;
        pkt->set(retval);
    } else {
        offset -= 8;
        assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;

        memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}

Tick
GpuDispatcher::write(PacketPtr pkt)
{
    assert(pkt->getAddr() >= pioAddr);
    assert(pkt->getAddr() < pioAddr + pioSize);

    int offset = pkt->getAddr() - pioAddr;

#if TRACING_ON
    uint64_t data_val = 0;

    switch (pkt->getSize()) {
      case 1:
        data_val = pkt->get<uint8_t>();
        break;
      case 2:
        data_val = pkt->get<uint16_t>();
        break;
      case 4:
        data_val = pkt->get<uint32_t>();
        break;
      case 8:
        data_val = pkt->get<uint64_t>();
        break;
      default:
        DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());
    }

    DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,
            pkt->getSize());
#endif
    if (!offset) {
        static int nextId = 0;

        // The depends field of the qstruct, which was previously unused, is
        // used to communicate with simulated application.
        if (curTask.depends) {
            HostState hs;
            shader->ReadMem((uint64_t)(curTask.depends), &hs,
                            sizeof(HostState), 0);

            // update event start time (in nano-seconds)
            uint64_t start = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),
                             &start, sizeof(uint64_t), 0);
        }

        // launch kernel
        ++num_kernelLaunched;

        NDRange *ndr = &(ndRangeMap[nextId]);
        // copy dispatch info
        ndr->q = curTask;

        // update the numDispTask polled by the runtime
        accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);

        ndr->numWgTotal = 1;

        for (int i = 0; i < 3; ++i) {
            ndr->wgId[i] = 0;
            ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);
            ndr->numWgTotal *= ndr->numWg[i];
        }

        ndr->numWgCompleted = 0;
        ndr->globalWgId = 0;
        ndr->wg_disp_rem = true;
        ndr->execDone = false;
        ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;
        ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;
        ndr->dispatchId = nextId;
        ndr->curCid = pkt->req->contextId();
        DPRINTF(GPUDisp, "launching kernel %d\n",nextId);
        execIds.push(nextId);
        ++nextId;

        dispatchActive = true;

        if (!tickEvent.scheduled()) {
            schedule(&tickEvent, curTick() + shader->ticks(1));
        }
    } else {
        // populate current task struct
        // first 64 bits are launch reg
        offset -= 8;
        assert(offset < sizeof(HsaQueueEntry));
        char *curTaskPtr = (char*)&curTask;
        memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());
    }

    pkt->makeAtomicResponse();

    return pioDelay;
}


BaseMasterPort&
GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)
{
    if (if_name == "translation_port") {
        return *tlbPort;
    }

    return DmaDevice::getMasterPort(if_name, idx);
}

void
GpuDispatcher::exec()
{
    int fail_count = 0;

    // There are potentially multiple outstanding kernel launches.
    // It is possible that the workgroups in a different kernel
    // can fit on the GPU even if another kernel's workgroups cannot
    DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());

    while (execIds.size() > fail_count) {
        int execId = execIds.front();

        while (ndRangeMap[execId].wg_disp_rem) {
            //update the thread context
            shader->updateContext(ndRangeMap[execId].curCid);

            // attempt to dispatch_workgroup
            if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {
                // if we failed try the next kernel,
                // it may have smaller workgroups.
                // put it on the queue to rety latter
                DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);
                execIds.push(execId);
                ++fail_count;
                break;
            }
        }
        // let's try the next kernel_id
        execIds.pop();
    }

    DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());

    if (doneIds.size() && cpu) {
        shader->hostWakeUp(cpu);
    }

    while (doneIds.size()) {
        // wakeup the CPU if any Kernels completed this cycle
        DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());
        doneIds.pop();
    }
}

void
GpuDispatcher::notifyWgCompl(Wavefront *w)
{
    int kern_id = w->kernId;
    DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);
    assert(ndRangeMap[kern_id].dispatchId == kern_id);
    ndRangeMap[kern_id].numWgCompleted++;

    if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {
        ndRangeMap[kern_id].execDone = true;
        doneIds.push(kern_id);

        if (ndRangeMap[kern_id].addrToNotify) {
            accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,
                          0);
        }

        accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);

        // update event end time (in nano-seconds)
        if (ndRangeMap[kern_id].q.depends) {
            HostState *host_state = (HostState*)ndRangeMap[kern_id].q.depends;
            uint64_t event;
            shader->ReadMem((uint64_t)(&host_state->event), &event,
                            sizeof(uint64_t), 0);

            uint64_t end = curTick() / 1000;

            shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,
                             sizeof(uint64_t), 0);
        }
    }

    if (!tickEvent.scheduled()) {
        schedule(&tickEvent, curTick() + shader->ticks(1));
    }
}

void
GpuDispatcher::scheduleDispatch()
{
    if (!tickEvent.scheduled())
        schedule(&tickEvent, curTick() + shader->ticks(1));
}

void
GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)
{
    if (cpu) {
        if (off) {
            shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,
                              true);
            val += off;
        }

        shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);
    } else {
        panic("Cannot find host");
    }
}

GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)
    : Event(CPU_Tick_Pri), dispatcher(_dispatcher)
{
}

void
GpuDispatcher::TickEvent::process()
{
    dispatcher->exec();
}

const char*
GpuDispatcher::TickEvent::description() const
{
    return "GPU Dispatcher tick";
}

// helper functions for driver to retrieve GPU attributes
int
GpuDispatcher::getNumCUs()
{
    return shader->cuList.size();
}

int
GpuDispatcher::wfSize() const
{
    return shader->cuList[0]->wfSize();
}

void
GpuDispatcher::setFuncargsSize(int funcargs_size)
{
    shader->funcargs_size = funcargs_size;
}

uint32_t
GpuDispatcher::getStaticContextSize() const
{
    return shader->cuList[0]->wfList[0][0]->getStaticContextSize();
}
gpu-compute: AMD's baseline GPU model 2016-01-19 20:28:22 +01:00			`/*`
			`* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.`
			`* All rights reserved.`
			`*`
			`* For use for simulation and test purposes only`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions are met:`
			`*`
			`* 1. Redistributions of source code must retain the above copyright notice,`
			`* this list of conditions and the following disclaimer.`
			`*`
			`* 2. Redistributions in binary form must reproduce the above copyright notice,`
			`* this list of conditions and the following disclaimer in the documentation`
			`* and/or other materials provided with the distribution.`
			`*`
			`* 3. Neither the name of the copyright holder nor the names of its contributors`
			`* may be used to endorse or promote products derived from this software`
			`* without specific prior written permission.`
			`*`
			`* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"`
			`* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE`
			`* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`* POSSIBILITY OF SUCH DAMAGE.`
			`*`
			`* Author: Brad Beckmann, Marc Orr`
			`*/`


			`#include "gpu-compute/dispatcher.hh"`

			`#include "cpu/base.hh"`
			`#include "debug/GPUDisp.hh"`
			`#include "gpu-compute/cl_driver.hh"`
			`#include "gpu-compute/cl_event.hh"`
			`#include "gpu-compute/shader.hh"`
			`#include "gpu-compute/wavefront.hh"`
			`#include "mem/packet_access.hh"`

			`GpuDispatcher *GpuDispatcher::instance = nullptr;`

			`GpuDispatcher::GpuDispatcher(const Params *p)`
			`: DmaDevice(p), _masterId(p->system->getMasterId(name() + ".disp")),`
			`pioAddr(p->pio_addr), pioSize(4096), pioDelay(p->pio_latency),`
			`dispatchCount(0), dispatchActive(false), cpu(p->cpu),`
			`shader(p->shader_pointer), driver(p->cl_driver), tickEvent(this)`
			`{`
			`shader->handshake(this);`
			`driver->handshake(this);`

			`ndRange.wg_disp_rem = false;`
			`ndRange.globalWgId = 0;`

			`schedule(&tickEvent, 0);`

			`// translation port for the dispatcher`
			`tlbPort = new TLBPort(csprintf("%s-port%d", name()), this);`

			`num_kernelLaunched`
			`.name(name() + ".num_kernel_launched")`
			`.desc("number of kernel launched")`
			`;`
			`}`

			`GpuDispatcher *GpuDispatcherParams::create()`
			`{`
			`GpuDispatcher *dispatcher = new GpuDispatcher(this);`
			`GpuDispatcher::setInstance(dispatcher);`

			`return GpuDispatcher::getInstance();`
			`}`

			`void`
			`GpuDispatcher::serialize(CheckpointOut &cp) const`
			`{`
			`Tick event_tick = 0;`

			`if (ndRange.wg_disp_rem)`
			`fatal("Checkpointing not supported during active workgroup execution");`

			`if (tickEvent.scheduled())`
			`event_tick = tickEvent.when();`

			`SERIALIZE_SCALAR(event_tick);`

			`}`

			`void`
			`GpuDispatcher::unserialize(CheckpointIn &cp)`
			`{`
			`Tick event_tick;`

			`if (tickEvent.scheduled())`
			`deschedule(&tickEvent);`

			`UNSERIALIZE_SCALAR(event_tick);`

			`if (event_tick)`
			`schedule(&tickEvent, event_tick);`
			`}`

			`AddrRangeList`
			`GpuDispatcher::getAddrRanges() const`
			`{`
			`AddrRangeList ranges;`

			`DPRINTF(GPUDisp, "dispatcher registering addr range at %#x size %#x\n",`
			`pioAddr, pioSize);`

			`ranges.push_back(RangeSize(pioAddr, pioSize));`

			`return ranges;`
			`}`

			`Tick`
			`GpuDispatcher::read(PacketPtr pkt)`
			`{`
			`assert(pkt->getAddr() >= pioAddr);`
			`assert(pkt->getAddr() < pioAddr + pioSize);`

			`int offset = pkt->getAddr() - pioAddr;`
			`pkt->allocate();`

			`DPRINTF(GPUDisp, " read register %#x size=%d\n", offset, pkt->getSize());`

			`if (offset < 8) {`
			`assert(!offset);`
			`assert(pkt->getSize() == 8);`

			`uint64_t retval = dispatchActive;`
			`pkt->set(retval);`
			`} else {`
			`offset -= 8;`
			`assert(offset + pkt->getSize() < sizeof(HsaQueueEntry));`
			`char curTaskPtr = (char)&curTask;`

			`memcpy(pkt->getPtr<const void*>(), curTaskPtr + offset, pkt->getSize());`
			`}`

			`pkt->makeAtomicResponse();`

			`return pioDelay;`
			`}`

			`Tick`
			`GpuDispatcher::write(PacketPtr pkt)`
			`{`
			`assert(pkt->getAddr() >= pioAddr);`
			`assert(pkt->getAddr() < pioAddr + pioSize);`

			`int offset = pkt->getAddr() - pioAddr;`

			`#if TRACING_ON`
			`uint64_t data_val = 0;`

			`switch (pkt->getSize()) {`
			`case 1:`
			`data_val = pkt->get<uint8_t>();`
			`break;`
			`case 2:`
			`data_val = pkt->get<uint16_t>();`
			`break;`
			`case 4:`
			`data_val = pkt->get<uint32_t>();`
			`break;`
			`case 8:`
			`data_val = pkt->get<uint64_t>();`
			`break;`
			`default:`
			`DPRINTF(GPUDisp, "bad size %d\n", pkt->getSize());`
			`}`

			`DPRINTF(GPUDisp, "write register %#x value %#x size=%d\n", offset, data_val,`
			`pkt->getSize());`
			`#endif`
			`if (!offset) {`
			`static int nextId = 0;`

			`// The depends field of the qstruct, which was previously unused, is`
			`// used to communicate with simulated application.`
			`if (curTask.depends) {`
			`HostState hs;`
			`shader->ReadMem((uint64_t)(curTask.depends), &hs,`
			`sizeof(HostState), 0);`

			`// update event start time (in nano-seconds)`
			`uint64_t start = curTick() / 1000;`

			`shader->WriteMem((uint64_t)(&((_cl_event*)hs.event)->start),`
			`&start, sizeof(uint64_t), 0);`
			`}`

			`// launch kernel`
			`++num_kernelLaunched;`

			`NDRange *ndr = &(ndRangeMap[nextId]);`
			`// copy dispatch info`
			`ndr->q = curTask;`

			`// update the numDispTask polled by the runtime`
			`accessUserVar(cpu, (uint64_t)(curTask.numDispLeft), 0, 1);`

			`ndr->numWgTotal = 1;`

			`for (int i = 0; i < 3; ++i) {`
			`ndr->wgId[i] = 0;`
			`ndr->numWg[i] = divCeil(curTask.gdSize[i], curTask.wgSize[i]);`
			`ndr->numWgTotal *= ndr->numWg[i];`
			`}`

			`ndr->numWgCompleted = 0;`
			`ndr->globalWgId = 0;`
			`ndr->wg_disp_rem = true;`
			`ndr->execDone = false;`
			`ndr->addrToNotify = (volatile bool*)curTask.addrToNotify;`
			`ndr->numDispLeft = (volatile uint32_t*)curTask.numDispLeft;`
			`ndr->dispatchId = nextId;`
mem: Remove threadId from memory request class In general, the ThreadID parameter is unnecessary in the memory system as the ContextID is what is used for the purposes of locks/wakeups. Since we allocate sequential ContextIDs for each thread on MT-enabled CPUs, ThreadID is unnecessary as the CPUs can identify the requesting thread through sideband info (SenderState / LSQ entries) or ContextID offset from the base ContextID for a cpu. This is a re-spin of 20264eb after the revert (bd1c6789) and includes some fixes of that commit. 2016-04-07 16:30:20 +02:00			`ndr->curCid = pkt->req->contextId();`
gpu-compute: AMD's baseline GPU model 2016-01-19 20:28:22 +01:00			`DPRINTF(GPUDisp, "launching kernel %d\n",nextId);`
			`execIds.push(nextId);`
			`++nextId;`

			`dispatchActive = true;`

			`if (!tickEvent.scheduled()) {`
			`schedule(&tickEvent, curTick() + shader->ticks(1));`
			`}`
			`} else {`
			`// populate current task struct`
			`// first 64 bits are launch reg`
			`offset -= 8;`
			`assert(offset < sizeof(HsaQueueEntry));`
			`char curTaskPtr = (char)&curTask;`
			`memcpy(curTaskPtr + offset, pkt->getPtr<const void*>(), pkt->getSize());`
			`}`

			`pkt->makeAtomicResponse();`

			`return pioDelay;`
			`}`


			`BaseMasterPort&`
			`GpuDispatcher::getMasterPort(const std::string &if_name, PortID idx)`
			`{`
			`if (if_name == "translation_port") {`
			`return *tlbPort;`
			`}`

			`return DmaDevice::getMasterPort(if_name, idx);`
			`}`

			`void`
			`GpuDispatcher::exec()`
			`{`
			`int fail_count = 0;`

			`// There are potentially multiple outstanding kernel launches.`
			`// It is possible that the workgroups in a different kernel`
			`// can fit on the GPU even if another kernel's workgroups cannot`
			`DPRINTF(GPUDisp, "Launching %d Kernels\n", execIds.size());`

			`while (execIds.size() > fail_count) {`
			`int execId = execIds.front();`

			`while (ndRangeMap[execId].wg_disp_rem) {`
			`//update the thread context`
mem: Remove threadId from memory request class In general, the ThreadID parameter is unnecessary in the memory system as the ContextID is what is used for the purposes of locks/wakeups. Since we allocate sequential ContextIDs for each thread on MT-enabled CPUs, ThreadID is unnecessary as the CPUs can identify the requesting thread through sideband info (SenderState / LSQ entries) or ContextID offset from the base ContextID for a cpu. This is a re-spin of 20264eb after the revert (bd1c6789) and includes some fixes of that commit. 2016-04-07 16:30:20 +02:00			`shader->updateContext(ndRangeMap[execId].curCid);`
gpu-compute: AMD's baseline GPU model 2016-01-19 20:28:22 +01:00
			`// attempt to dispatch_workgroup`
			`if (!shader->dispatch_workgroups(&ndRangeMap[execId])) {`
			`// if we failed try the next kernel,`
			`// it may have smaller workgroups.`
			`// put it on the queue to rety latter`
			`DPRINTF(GPUDisp, "kernel %d failed to launch\n", execId);`
			`execIds.push(execId);`
			`++fail_count;`
			`break;`
			`}`
			`}`
			`// let's try the next kernel_id`
			`execIds.pop();`
			`}`

			`DPRINTF(GPUDisp, "Returning %d Kernels\n", doneIds.size());`

			`if (doneIds.size() && cpu) {`
			`shader->hostWakeUp(cpu);`
			`}`

			`while (doneIds.size()) {`
			`// wakeup the CPU if any Kernels completed this cycle`
			`DPRINTF(GPUDisp, "WorkGroup %d completed\n", doneIds.front());`
			`doneIds.pop();`
			`}`
			`}`

			`void`
			`GpuDispatcher::notifyWgCompl(Wavefront *w)`
			`{`
gpu-compute: Wavefront refactoring Renaming members of the Wavefront class in accordance with the style guide. 2016-09-16 18:26:52 +02:00			`int kern_id = w->kernId;`
gpu-compute: AMD's baseline GPU model 2016-01-19 20:28:22 +01:00			`DPRINTF(GPUDisp, "notify WgCompl %d\n",kern_id);`
			`assert(ndRangeMap[kern_id].dispatchId == kern_id);`
			`ndRangeMap[kern_id].numWgCompleted++;`

			`if (ndRangeMap[kern_id].numWgCompleted == ndRangeMap[kern_id].numWgTotal) {`
			`ndRangeMap[kern_id].execDone = true;`
			`doneIds.push(kern_id);`

			`if (ndRangeMap[kern_id].addrToNotify) {`
			`accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].addrToNotify), 1,`
			`0);`
			`}`

			`accessUserVar(cpu, (uint64_t)(ndRangeMap[kern_id].numDispLeft), 0, -1);`

			`// update event end time (in nano-seconds)`
			`if (ndRangeMap[kern_id].q.depends) {`
			`HostState host_state = (HostState)ndRangeMap[kern_id].q.depends;`
			`uint64_t event;`
			`shader->ReadMem((uint64_t)(&host_state->event), &event,`
			`sizeof(uint64_t), 0);`

			`uint64_t end = curTick() / 1000;`

			`shader->WriteMem((uint64_t)(&((_cl_event*)event)->end), &end,`
			`sizeof(uint64_t), 0);`
			`}`
			`}`

			`if (!tickEvent.scheduled()) {`
			`schedule(&tickEvent, curTick() + shader->ticks(1));`
			`}`
			`}`

			`void`
			`GpuDispatcher::scheduleDispatch()`
			`{`
			`if (!tickEvent.scheduled())`
			`schedule(&tickEvent, curTick() + shader->ticks(1));`
			`}`

			`void`
			`GpuDispatcher::accessUserVar(BaseCPU *cpu, uint64_t addr, int val, int off)`
			`{`
			`if (cpu) {`
			`if (off) {`
			`shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::ReadReq,`
			`true);`
			`val += off;`
			`}`

			`shader->AccessMem(addr, &val, sizeof(int), 0, MemCmd::WriteReq, true);`
			`} else {`
			`panic("Cannot find host");`
			`}`
			`}`

			`GpuDispatcher::TickEvent::TickEvent(GpuDispatcher *_dispatcher)`
			`: Event(CPU_Tick_Pri), dispatcher(_dispatcher)`
			`{`
			`}`

			`void`
			`GpuDispatcher::TickEvent::process()`
			`{`
			`dispatcher->exec();`
			`}`

			`const char*`
			`GpuDispatcher::TickEvent::description() const`
			`{`
			`return "GPU Dispatcher tick";`
			`}`

			`// helper functions for driver to retrieve GPU attributes`
			`int`
			`GpuDispatcher::getNumCUs()`
			`{`
			`return shader->cuList.size();`
			`}`

gpu-compute: parametrize Wavefront size Eliminate the VSZ constant that defined the Wavefront size (in numbers of work items); replaced it with a parameter in the GPU.py configuration script. Changed all data structures dependent on the Wavefront size to be dynamically sized. Legal values of Wavefront size are 16, 32, 64 for now and checked at initialization time. 2016-06-09 17:24:55 +02:00			`int`
			`GpuDispatcher::wfSize() const`
			`{`
			`return shader->cuList[0]->wfSize();`
			`}`

gpu-compute: AMD's baseline GPU model 2016-01-19 20:28:22 +01:00			`void`
			`GpuDispatcher::setFuncargsSize(int funcargs_size)`
			`{`
			`shader->funcargs_size = funcargs_size;`
			`}`
gpu-compute: Adding ioctl for HW context size Adding runtime support for determining the memory required by a SIMD engine when executing a particular wavefront. 2016-09-16 18:27:56 +02:00
			`uint32_t`
gpu-compute: fix typo in GPUDispatcher 2016-09-16 20:47:19 +02:00			`GpuDispatcher::getStaticContextSize() const`
gpu-compute: Adding ioctl for HW context size Adding runtime support for determining the memory required by a SIMD engine when executing a particular wavefront. 2016-09-16 18:27:56 +02:00			`{`
			`return shader->cuList[0]->wfList[0][0]->getStaticContextSize();`
			`}`