7ac38849ab
this patch removes the GPUStaticInst enums that were defined in GPU.py. instead, a simple set of attribute flags that can be set in the base instruction class are used. this will help unify the attributes of HSAIL and machine ISA instructions within the model itself. because the static instruction now carries the attributes, a GPUDynInst must carry a pointer to a valid GPUStaticInst, so a new static kernel launch instruction is added, which carries the attributes needed to perform the kernel launch.
985 lines
33 KiB
C++
985 lines
33 KiB
C++
/*
|
|
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* Author: Lisa Hsu
|
|
*/
|
|
|
|
#include "gpu-compute/wavefront.hh"
|
|
|
|
#include "debug/GPUExec.hh"
|
|
#include "debug/WavefrontStack.hh"
|
|
#include "gpu-compute/compute_unit.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/shader.hh"
|
|
#include "gpu-compute/vector_register_file.hh"
|
|
|
|
/**
 * Factory hook used by the generated Python parameter class to build a
 * Wavefront SimObject from its parameter struct.
 */
Wavefront*
WavefrontParams::create()
{
    Wavefront *wf = new Wavefront(this);
    return wf;
}
|
|
|
|
/**
 * Construct a wavefront in the stopped state with all request counters,
 * barrier state, and fetch state cleared. All per-lane containers are
 * sized to the configured wavefront width (p->wfSize).
 */
Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr)
{
    // identity within the compute unit
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;

    lastTrace = 0;
    status = S_STOPPED;
    reservedVectorRegs = 0;
    startVgprIndex = 0;

    // no memory requests are outstanding or in the pipelines yet
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    // barrier bookkeeping starts clear
    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;

    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    maxSpVgprs = 0;
    maxDpVgprs = 0;

    // size every per-lane structure to the wavefront width
    oldVgpr.resize(p->wfSize);
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);

    for (int dim = 0; dim < 3; ++dim) {
        workItemId[dim].resize(p->wfSize);
    }
}
|
|
|
|
void
|
|
Wavefront::regStats()
|
|
{
|
|
SimObject::regStats();
|
|
|
|
srcRegOpDist
|
|
.init(0, 4, 2)
|
|
.name(name() + ".src_reg_operand_dist")
|
|
.desc("number of executed instructions with N source register operands")
|
|
;
|
|
|
|
dstRegOpDist
|
|
.init(0, 3, 2)
|
|
.name(name() + ".dst_reg_operand_dist")
|
|
.desc("number of executed instructions with N destination register "
|
|
"operands")
|
|
;
|
|
|
|
// FIXME: the name of the WF needs to be unique
|
|
numTimesBlockedDueWAXDependencies
|
|
.name(name() + ".timesBlockedDueWAXDependencies")
|
|
.desc("number of times the wf's instructions are blocked due to WAW "
|
|
"or WAR dependencies")
|
|
;
|
|
|
|
// FIXME: the name of the WF needs to be unique
|
|
numTimesBlockedDueRAWDependencies
|
|
.name(name() + ".timesBlockedDueRAWDependencies")
|
|
.desc("number of times the wf's instructions are blocked due to RAW "
|
|
"dependencies")
|
|
;
|
|
|
|
// FIXME: the name of the WF needs to be unique
|
|
numTimesBlockedDueVrfPortAvail
|
|
.name(name() + ".timesBlockedDueVrfPortAvail")
|
|
.desc("number of times instructions are blocked due to VRF port "
|
|
"availability")
|
|
;
|
|
}
|
|
|
|
void
|
|
Wavefront::init()
|
|
{
|
|
reservedVectorRegs = 0;
|
|
startVgprIndex = 0;
|
|
}
|
|
|
|
void
|
|
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
|
|
{
|
|
condRegState->init(num_cregs);
|
|
maxSpVgprs = num_sregs;
|
|
maxDpVgprs = num_dregs;
|
|
}
|
|
|
|
/**
 * Release the heap-allocated call-argument memory and condition
 * register state owned by this wavefront.
 */
Wavefront::~Wavefront()
{
    // deleting a null pointer is a well-defined no-op, so the previous
    // `if (callArgMem)` guard was redundant
    delete callArgMem;
    delete condRegState;
}
|
|
|
|
void
|
|
Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
|
|
{
|
|
wfDynId = _wf_dyn_id;
|
|
basePtr = _base_ptr;
|
|
status = S_RUNNING;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isGmInstruction(GPUDynInstPtr ii)
|
|
{
|
|
if (ii->isGlobalMem() || ii->isFlat())
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isLmInstruction(GPUDynInstPtr ii)
|
|
{
|
|
if (ii->isLocalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstALU()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && (ii->isNop() ||
|
|
ii->isReturn() || ii->isBranch() ||
|
|
ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstBarrier()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isBarrier()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstGMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isGlobalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstLMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isLocalMem()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstPrivMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isPrivateSeg()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
Wavefront::isOldestInstFlatMem()
|
|
{
|
|
assert(!instructionBuffer.empty());
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
if (status != S_STOPPED && ii->isFlat()) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Return true if the Wavefront's instruction
|
|
// buffer has branch instruction.
|
|
bool
|
|
Wavefront::instructionBufferHasBranch()
|
|
{
|
|
for (auto it : instructionBuffer) {
|
|
GPUDynInstPtr ii = it;
|
|
|
|
if (ii->isReturn() || ii->isBranch()) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Remap HSAIL register to physical VGPR.
|
|
// HSAIL register = virtual register assigned to an operand by HLC compiler
|
|
uint32_t
|
|
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
|
|
{
|
|
assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
|
|
// add the offset from where the VGPRs of the wavefront have been assigned
|
|
uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
|
|
// HSAIL double precision (DP) register: calculate the physical VGPR index
|
|
// assuming that DP registers are placed after SP ones in the VRF. The DP
|
|
// and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
|
|
// the DP VGPR index before mapping it to the physical VRF address space
|
|
if (mode == 1 && size > 4) {
|
|
physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
|
|
}
|
|
|
|
assert((startVgprIndex <= physicalVgprIndex) &&
|
|
(startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);
|
|
|
|
// calculate absolute physical VGPR index
|
|
return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
|
|
}
|
|
|
|
// Return true if this wavefront is ready
// to execute an instruction of the specified type.
//
// The check proceeds in phases: (1) the wave must be running and have a
// buffered instruction; (2) if stalled at a barrier, every wave in the
// work-group must have arrived before it can be released; (3) the oldest
// instruction must match the requested type and all of the resources it
// needs (wave slot, VRF ports/operands, memory-pipe buses and FIFOs, and
// ordering constraints on outstanding requests) must be available.
// Returns 1 when issuable, 0 otherwise.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
                    computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        // all waves have arrived: release this wave from the barrier
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    // ready_inst is only consulted by the assert below (hence M5_VAR_USED)
    bool ready_inst M5_VAR_USED = false;
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        // a single free global-memory bus/issue slot is sufficient
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        // a single free local-memory bus/issue slot is sufficient
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
        ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
        ii->isMemFence() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->isBarrier()) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isNop()) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isReturn()) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg())) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && ii->isGlobalMem()) {
        // Here Global memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && ii->isLocalMem()) {
        // Here for Shared memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // no reads/atomics/fences while local writes are pending
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // no writes/atomics/fences while local reads are pending
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && ii->isFlat()) {
        // flat accesses may resolve to either aperture, so both the global
        // and local pipelines must have resources available
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        // oldest instruction does not match the requested type
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}
|
|
|
|
// Reserve (preset) the pipeline resources the oldest instruction will
// need, and account for the memory requests it will place in the pipes.
// This mirrors the set() calls in Wavefront::exec(); the two ladders must
// classify instructions identically. The 4- vs 8-tick bus occupancies
// presumably model read vs. write/atomic transfer cost — confirm against
// the pipeline model before relying on the exact values.
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        // flat load: aperture already resolved (executedAs() is valid here)
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        // flat store: same structure as flat load, but counts as a write
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        // atomics/fences count as both a read and a write
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        // local atomics/fences count as both a read and a write
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
|
|
|
|
// Execute the wavefront's oldest buffered instruction: run its semantics,
// access the VRF, update stats, advance (or pop) the reconvergence-stack
// PC, and commit the pipeline-resource occupancies that updateResources()
// presaged. The resource ladder at the bottom must classify instructions
// identically to the one in updateResources().
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    // remember the PC so we can tell whether the instruction changed it
    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);
    ii->execute(ii);
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = old_pc + 1;
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence point: pop the divergence frame and
            // discard any fetched instructions from the old path
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    }

    // per-instruction lane-activity statistics (SIMT mode only)
    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        // flat access: occupy the pipeline of the aperture it resolved to
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
|
|
|
|
bool
|
|
Wavefront::waitingAtBarrier(int lane)
|
|
{
|
|
return barCnt[lane] < maxBarCnt;
|
|
}
|
|
|
|
void
|
|
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
|
|
const VectorMask& mask)
|
|
{
|
|
assert(mask.count());
|
|
reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
|
|
}
|
|
|
|
// Pop the top divergence frame, tracing the execution mask and PC before
// and after the pop. Code left byte-identical on purpose: the
// execMask()/pc() expressions inside DPRINTF are only evaluated when the
// WavefrontStack debug flag is enabled, so they must not be hoisted out.
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());

}
|
|
|
|
void
|
|
Wavefront::discardFetch()
|
|
{
|
|
instructionBuffer.clear();
|
|
dropFetch |=pendingFetch;
|
|
}
|
|
|
|
/// Current PC: read from the active (top) reconvergence stack entry.
uint32_t
Wavefront::pc() const
{
    const auto &top = reconvergenceStack.back();
    return top->pc;
}
|
|
|
|
/// Reconvergence PC of the active (top) reconvergence stack entry.
uint32_t
Wavefront::rpc() const
{
    const auto &top = reconvergenceStack.back();
    return top->rpc;
}
|
|
|
|
/// Execution mask of the active (top) reconvergence stack entry.
VectorMask
Wavefront::execMask() const
{
    const auto &top = reconvergenceStack.back();
    return top->execMask;
}
|
|
|
|
bool
|
|
Wavefront::execMask(int lane) const
|
|
{
|
|
return reconvergenceStack.back()->execMask[lane];
|
|
}
|
|
|
|
|
|
void
|
|
Wavefront::pc(uint32_t new_pc)
|
|
{
|
|
reconvergenceStack.back()->pc = new_pc;
|
|
}
|
|
|
|
// Size, in bytes, of the fixed-layout portion of the context written by
// getContext(): per-lane barrier counts, scalar bookkeeping fields, and
// one ReconvergenceStackEntry slot per lane.
// NOTE(review): this total does not appear to include the SP/DP VGPR,
// condition-register, or LDS bytes that getContext() also writes —
// presumably callers add those separately; confirm against the callers.
uint32_t
Wavefront::getStaticContextSize() const
{
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
           sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
           sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
           sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
           computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}
|
|
|
|
// Serialize the wavefront's architectural context into the raw buffer
// @p out: per-lane barrier counts, scalar state, the reconvergence stack
// (unused slots padded with a sentinel "empty" entry), SP then DP VGPRs,
// condition registers, and finally LDS contents. The layout must match
// setContext() exactly.
// NOTE(review): this DRAINS reconvergenceStack (pop_back while saving),
// so the wavefront's stack is empty afterwards — presumably intentional
// for context switch-out; confirm with callers. Entries are saved from
// top of stack downward and re-pushed in that order by setContext().
void
Wavefront::getContext(const void *out)
{
    // raw byte cursor into the caller-provided buffer
    uint8_t *iter = (uint8_t *)out;
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    // the mask is stored as its 64-bit integer representation
    *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    int stackSize = reconvergenceStack.size();
    // sentinel entry marking unused stack slots (all-ones fields)
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    int wf_size = computeUnit->wfSize();
    // single-precision VGPRs, lane-major within each register
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx,lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // double-precision VGPRs
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx,lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // condition registers
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *) iter = val; iter += sizeof(val);
        }
}
|
|
|
|
// Restore the wavefront's architectural context from the raw buffer
// @p in. The read order and field sizes must mirror getContext()
// exactly: per-lane barrier counts, scalar state, reconvergence stack
// (sentinel-PC entries are skipped), SP then DP VGPRs, condition
// registers, then LDS contents.
void
Wavefront::setContext(const void *in)
{
    // raw byte cursor into the caller-provided buffer
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    // the mask was saved as its 64-bit integer representation
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        // a sentinel PC (all-ones) marks an unused slot — skip it
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    // single-precision VGPRs, lane-major within each register
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    // double-precision VGPRs
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    // condition registers
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /** Restoring LDS contents */
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
}
|
|
|
|
void
|
|
Wavefront::computeActualWgSz(NDRange *ndr)
|
|
{
|
|
actualWgSzTotal = 1;
|
|
for (int d = 0; d < 3; ++d) {
|
|
actualWgSz[d] = std::min(workGroupSz[d],
|
|
gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
|
|
actualWgSzTotal *= actualWgSz[d];
|
|
}
|
|
}
|