gem5/src/gpu-compute/wavefront.cc
Tony Gutierrez d327cdba07 gpu-compute: add gpu_isa.hh to switch hdrs, add GPUISA to WF
the GPUISA class is meant to encapsulate any ISA-specific behavior - special
register accesses, isa-specific WF/kernel state, etc. - in a generic enough
way so that it may be used in ISA-agnostic code.

gpu-compute: use the GPUISA object to advance the PC

the GPU model treats the PC as a pointer to individual instruction objects -
which are stored in a contiguous array - and not a byte address to be fetched
from the real memory system. this is ok for HSAIL because all instructions
are considered by the model to be the same size.

in machine ISA, however, instructions may be 32b or 64b, and branches are
calculated by advancing the PC by the number of words (4 byte chunks) it
needs to advance in the real instruction stream. because of this there is
a mismatch between the PC we use to index into the instruction array, and
the actual byte address PC the ISA expects. here we move the PC advance
calculation to the ISA so that differences in the instruction sizes may be
accounted for in a generic way.
2016-10-26 22:47:38 -04:00

991 lines
34 KiB
C++

/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Lisa Hsu
*/
#include "gpu-compute/wavefront.hh"
#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
/**
 * Build the Wavefront described by this parameter object.
 *
 * @return a newly heap-allocated Wavefront constructed from these params;
 *         nothing in this function retains or frees the pointer.
 */
Wavefront*
WavefrontParams::create()
{
    Wavefront *wavefront = new Wavefront(this);
    return wavefront;
}
/**
 * Construct a wavefront in the S_STOPPED state with all request counters,
 * barrier counters and register bookkeeping cleared, and with every
 * per-lane container sized to p->wfSize.
 *
 * @param p parameter object; simdId, wf_slot_id and wfSize are read from it.
 */
Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr), _gpuISA(*this)
{
    const auto lanes = p->wfSize;

    // position of this wavefront inside its compute unit
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;

    // vector register allocation (nothing reserved yet)
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    condRegState = new ConditionRegisterState();

    // memory requests: overall, then split by read/write and global/local
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    // barrier state
    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    // fetch state
    pendingFetch = false;
    dropFetch = false;

    // tracing bookkeeping; the all-ones values are the initial "old" counts
    lastTrace = 0;
    memTraceBusy = 0;
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;

    // one element per lane
    oldVgpr.resize(lanes);
    oldDgpr.resize(lanes);
    lastAddr.resize(lanes);
    workItemFlatId.resize(lanes);
    barCnt.resize(lanes);
    for (int dim = 0; dim != 3; ++dim) {
        workItemId[dim].resize(lanes);
    }
}
/**
 * Register this wavefront's statistics: two operand-count distributions
 * and three "blocked" counters. Every stat name is this object's name()
 * followed by a fixed suffix.
 */
void
Wavefront::regStats()
{
    SimObject::regStats();

    // common prefix for every statistic registered below
    const auto stat_prefix = name();

    srcRegOpDist
        .init(0, 4, 2)
        .name(stat_prefix + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(stat_prefix + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(stat_prefix + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(stat_prefix + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(stat_prefix + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}
/**
 * Reset the vector register allocation: no registers reserved and the
 * allocation base back at index 0.
 */
void
Wavefront::init()
{
    startVgprIndex = 0;
    reservedVectorRegs = 0;
}
/**
 * Record the register requirements of the code about to run.
 *
 * @param num_cregs number of condition registers; forwarded to
 *                  condRegState->init()
 * @param num_sregs stored as maxSpVgprs
 * @param num_dregs stored as maxDpVgprs
 */
void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
    condRegState->init(num_cregs);
}
/**
 * Release the objects this wavefront deletes on destruction: the call
 * argument memory (null unless something assigned it after construction)
 * and the condition register state allocated in the constructor.
 */
Wavefront::~Wavefront()
{
    // deleting a null pointer is a no-op, so callArgMem needs no guard
    delete callArgMem;
    delete condRegState;
}
/**
 * Mark the wavefront as running.
 *
 * @param _wf_dyn_id value stored in wfDynId
 * @param _base_ptr  value stored in basePtr
 */
void
Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
{
    status = S_RUNNING;
    basePtr = _base_ptr;
    wfDynId = _wf_dyn_id;
}
/**
 * @param ii instruction to classify
 * @return true if ii is a global-memory or a flat instruction
 */
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    return ii->isGlobalMem() || ii->isFlat();
}
/**
 * @param ii instruction to classify
 * @return true if ii is a local-memory instruction
 */
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    return ii->isLocalMem();
}
bool
Wavefront::isOldestInstALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && (ii->isNop() ||
ii->isReturn() || ii->isBranch() ||
ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstBarrier()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isBarrier()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstGMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isGlobalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstLMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isLocalMem()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstPrivMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isPrivateSeg()) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstFlatMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->isFlat()) {
return true;
}
return false;
}
// Return true if the Wavefront's instruction buffer holds at least one
// branch or return instruction; false otherwise (including when the
// buffer is empty).
bool
Wavefront::instructionBufferHasBranch()
{
    // iterate by const reference: the elements are only inspected, so
    // there is no need to copy each instruction handle
    for (const auto &inst : instructionBuffer) {
        if (inst->isReturn() || inst->isBranch()) {
            return true;
        }
    }
    return false;
}
// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
//
// vgprIndex: wavefront-relative register index; must be smaller than
//            reservedVectorRegs (asserted)
// size:      operand size in bytes; together with mode == 1, a size
//            greater than 4 selects the double precision mapping
// mode:      1 enables the separate SP/DP name-space handling
// returns:   the resulting index modulo the number of registers in this
//            SIMD's vector register file
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));

    uint32_t phys_idx;
    if (mode == 1 && size > 4) {
        // HSAIL double precision (DP) register: DP registers are assumed to
        // sit after the SP ones in the VRF and the DP name space is separate
        // from the SP one, so skip the SP registers and scale the index by 2
        phys_idx = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    } else {
        // plain offset from the start of this wavefront's VGPR allocation
        phys_idx = startVgprIndex + vgprIndex;
    }

    // the result has to stay inside the range reserved for this wavefront
    assert((startVgprIndex <= phys_idx) &&
           (startVgprIndex + reservedVectorRegs - 1) >= phys_idx);

    // calculate absolute physical VGPR index
    const auto vrf_regs = computeUnit->vrf[simdId]->numRegs();
    return phys_idx % vrf_regs;
}
// Return true if this wavefront is ready
// to execute an instruction of the specified type.
//
// type:    issue class being queried; the values tested below are I_ALU,
//          I_GLOBAL, I_SHARED, I_FLAT and I_PRIVATE
// returns: 1 when the oldest buffered instruction belongs to the queried
//          class and every readiness check for that class passes, 0 in all
//          other cases (the int is used as a boolean)
//
// Side effect: if the wavefront is stalled at a barrier and AllAtBarrier()
// succeeds, stalledAtBarrier is cleared and oldBarrierCnt is updated before
// the remaining checks run. Panics if the oldest instruction matches none of
// the instruction kinds listed in the sanity check below.
int
Wavefront::ready(itype_e type)
{
// Check to make sure wave is running
if (status == S_STOPPED || status == S_RETURNING ||
instructionBuffer.empty()) {
return 0;
}
// Is the wave waiting at a barrier
if (stalledAtBarrier) {
if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
computeUnit->getRefCounter(dispatchId, wgId))) {
// Are all threads at barrier?
return 0;
}
// barrier released: remember the count we were released at
oldBarrierCnt = barrierCnt;
stalledAtBarrier = false;
}
// Read instruction
GPUDynInstPtr ii = instructionBuffer.front();
// ready_inst is only consumed by the assert at the end of the function
bool ready_inst M5_VAR_USED = false;
// Global memory side: each flag becomes true if at least one of the
// numGlbMemUnits entries reports prerdy().
// NOTE(review): wfWait is indexed by the loop counter j here, while the
// ALU branches below use wfWait[simdId] and updateResources()/exec() use
// wfWait[GlbMemUnitId()] - confirm j is the intended slot.
bool glbMemBusRdy = false;
bool glbMemIssueRdy = false;
if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
glbMemBusRdy = true;
if (computeUnit->wfWait[j].prerdy())
glbMemIssueRdy = true;
}
}
// Local memory side: same scheme over numLocMemUnits entries (the same
// wfWait[j] indexing question applies).
bool locMemBusRdy = false;
bool locMemIssueRdy = false;
if (type == I_SHARED || type == I_FLAT) {
for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
locMemBusRdy = true;
if (computeUnit->wfWait[j].prerdy())
locMemIssueRdy = true;
}
}
// The following code is very error prone and the entire process for
// checking readiness will be fixed eventually. In the meantime, let's
// make sure that we do not silently let an instruction type slip
// through this logic and always return not ready.
if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
ii->isMemFence() || ii->isFlat())) {
panic("next instruction: %s is of unknown type\n", ii->disassemble());
}
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());
// Each branch below pairs the queried class with the kind of the oldest
// instruction; the first failing check returns 0 immediately.
if (type == I_ALU && ii->isBarrier()) {
// Here for ALU instruction (barrier)
if (!computeUnit->wfWait[simdId].prerdy()) {
// Is wave slot free?
return 0;
}
// Are there in pipe or outstanding memory requests?
if ((outstandingReqs + memReqsInPipe) > 0) {
return 0;
}
ready_inst = true;
} else if (type == I_ALU && ii->isNop()) {
// Here for ALU instruction (nop)
if (!computeUnit->wfWait[simdId].prerdy()) {
// Is wave slot free?
return 0;
}
ready_inst = true;
} else if (type == I_ALU && ii->isReturn()) {
// Here for ALU instruction (return)
if (!computeUnit->wfWait[simdId].prerdy()) {
// Is wave slot free?
return 0;
}
// Are there in pipe or outstanding memory requests?
if ((outstandingReqs + memReqsInPipe) > 0) {
return 0;
}
ready_inst = true;
} else if (type == I_ALU && (ii->isBranch() ||
ii->isALU() ||
(ii->isKernArgSeg() && ii->isLoad()) ||
ii->isArgSeg())) {
// Here for ALU instruction (all others)
if (!computeUnit->wfWait[simdId].prerdy()) {
// Is alu slot free?
return 0;
}
// can we schedule source & destination operands on the VRF?
if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
VrfAccessType::RD_WR)) {
return 0;
}
// are all the operands ready?
if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
return 0;
}
ready_inst = true;
} else if (type == I_GLOBAL && ii->isGlobalMem()) {
// Here Global memory instruction
if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
// Are there in pipe or outstanding global memory write requests?
if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
return 0;
}
}
if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
// Are there in pipe or outstanding global memory read requests?
if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
return 0;
}
if (!glbMemIssueRdy) {
// Is WV issue slot free?
return 0;
}
if (!glbMemBusRdy) {
// Is there an available VRF->Global memory read bus?
return 0;
}
if (!computeUnit->globalMemoryPipe.
isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
// Can we insert a new request to the Global Mem Request FIFO?
return 0;
}
// can we schedule source & destination operands on the VRF?
if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
VrfAccessType::RD_WR)) {
return 0;
}
if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
return 0;
}
ready_inst = true;
} else if (type == I_SHARED && ii->isLocalMem()) {
// Here for Shared memory instruction
if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
// Are there in pipe or outstanding local memory write requests?
if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
return 0;
}
}
if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
// Are there in pipe or outstanding local memory read requests?
if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
return 0;
}
}
if (!locMemBusRdy) {
// Is there an available VRF->LDS read bus?
return 0;
}
if (!locMemIssueRdy) {
// Is wave slot free?
return 0;
}
if (!computeUnit->localMemoryPipe.
isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
// Can we insert a new request to the LDS Request FIFO?
return 0;
}
// can we schedule source & destination operands on the VRF?
if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
VrfAccessType::RD_WR)) {
return 0;
}
if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
return 0;
}
ready_inst = true;
} else if (type == I_FLAT && ii->isFlat()) {
// Flat instructions need both the global and the local memory
// resources to be available
if (!glbMemBusRdy) {
// Is there an available VRF->Global memory read bus?
return 0;
}
if (!locMemBusRdy) {
// Is there an available VRF->LDS read bus?
return 0;
}
if (!glbMemIssueRdy) {
// Is wave slot free?
return 0;
}
if (!locMemIssueRdy) {
return 0;
}
if (!computeUnit->globalMemoryPipe.
isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
// Can we insert a new request to the Global Mem Request FIFO?
return 0;
}
if (!computeUnit->localMemoryPipe.
isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
// Can we insert a new request to the LDS Request FIFO?
return 0;
}
// can we schedule source & destination operands on the VRF?
if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
VrfAccessType::RD_WR)) {
return 0;
}
// are all the operands ready? (RAW, WAW and WAR depedencies met?)
if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
return 0;
}
ready_inst = true;
} else {
// The queried class does not match the oldest instruction.
// NOTE(review): no branch above accepts type == I_PRIVATE, so such a
// query always ends up here - confirm that is intended.
return 0;
}
assert(ready_inst);
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
simdId, wfSlotId, ii->disassemble());
return 1;
}
// Reserve, via preset(), the resources the oldest buffered instruction will
// occupy, and bump the in-pipe memory request counters that ready() consults.
//
// For every instruction the VRF's updateResources() is called first. After
// that exactly one branch of the chain below applies, chosen by instruction
// kind:
//  - ALU-like instructions preset the SIMD's ALU pipe and its wfWait slot
//  - barriers preset only the wfWait slot
//  - memory instructions increment memReqsInPipe plus the matching
//    read/write counter(s), then preset one VRF->memory bus and the wfWait
//    slot of the shared or global memory unit
// Loads preset the bus with ticks(4); stores, atomics and fences with
// ticks(8).
//
// NOTE(review): unlike ready() and exec(), there is no empty-buffer check
// before front() - presumably callers guarantee a non-empty buffer; verify.
void
Wavefront::updateResources()
{
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
assert(ii);
computeUnit->vrf[simdId]->updateResources(this, ii);
// Single precision ALU or Branch or Return or Special instruction
if (ii->isALU() || ii->isSpecialOp() ||
ii->isBranch() ||
// FIXME: Kernel argument loads are currently treated as ALU operations
// since we don't send memory packets at execution. If we fix that then
// we should map them to one of the memory pipelines
(ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
ii->isReturn()) {
computeUnit->aluPipe[simdId].preset(computeUnit->shader->
ticks(computeUnit->spBypassLength()));
// this is to enforce a fixed number of cycles per issue slot per SIMD
computeUnit->wfWait[simdId].preset(computeUnit->shader->
ticks(computeUnit->issuePeriod));
} else if (ii->isBarrier()) {
computeUnit->wfWait[simdId].preset(computeUnit->shader->
ticks(computeUnit->issuePeriod));
} else if (ii->isLoad() && ii->isFlat()) {
// flat load: the segment it resolves to must already be known
assert(Enums::SC_NONE != ii->executedAs());
// NOTE(review): counted as a global read even when executedAs() is
// SC_SHARED - confirm this accounting is intended
memReqsInPipe++;
rdGmReqsInPipe++;
if ( Enums::SC_SHARED == ii->executedAs() ) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
} else if (ii->isStore() && ii->isFlat()) {
// flat store: same scheme as the flat load, counted as a global write
assert(Enums::SC_NONE != ii->executedAs());
memReqsInPipe++;
wrGmReqsInPipe++;
if (Enums::SC_SHARED == ii->executedAs()) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
} else if (ii->isLoad() && ii->isGlobalMem()) {
// global load
memReqsInPipe++;
rdGmReqsInPipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (ii->isStore() && ii->isGlobalMem()) {
// global store
memReqsInPipe++;
wrGmReqsInPipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
// global atomic / fence: counted as both a read and a write
memReqsInPipe++;
wrGmReqsInPipe++;
rdGmReqsInPipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (ii->isLoad() && ii->isLocalMem()) {
// local load
memReqsInPipe++;
rdLmReqsInPipe++;
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (ii->isStore() && ii->isLocalMem()) {
// local store
memReqsInPipe++;
wrLmReqsInPipe++;
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
// local atomic / fence: counted as both a read and a write
memReqsInPipe++;
wrLmReqsInPipe++;
rdLmReqsInPipe++;
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
}
// Execute the oldest buffered instruction and account for it.
//
// Steps, in order:
//  1. return immediately if the wavefront is stopped, returning, or has no
//     buffered instruction
//  2. execute the instruction, update the CU instruction statistics and
//     access the VRF
//  3. sample the operand-count and execution-rate statistics
//  4. if the instruction left the PC untouched, advance it through the
//     GPUISA object; otherwise discard the fetched instructions
//  5. in SIMT mode, sample the active-lane statistics
//  6. mark the pipeline resources busy with set(); the branch structure
//     matches the one in updateResources(), which uses preset() instead
void
Wavefront::exec()
{
// ---- Exit if wavefront is inactive ----------------------------- //
if (status == S_STOPPED || status == S_RETURNING ||
instructionBuffer.empty()) {
return;
}
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
// remember the PC so we can tell whether the instruction changed it
const uint32_t old_pc = pc();
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
"(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
ii->disassemble(), old_pc);
// execute the instruction, then update the instruction stats in the CU
ii->execute(ii);
computeUnit->updateInstStats(ii);
// access the VRF
computeUnit->vrf[simdId]->exec(ii, this);
srcRegOpDist.sample(ii->numSrcRegOperands());
dstRegOpDist.sample(ii->numDstRegOperands());
computeUnit->numInstrExecuted++;
// cycles elapsed since this SIMD last executed an instruction
computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
if (pc() == old_pc) {
uint32_t new_pc = _gpuISA.advancePC(old_pc, ii);
// PC not modified by instruction, proceed to next or pop frame
pc(new_pc);
if (new_pc == rpc()) {
// reached the reconvergence PC of the current stack entry
popFromReconvergenceStack();
discardFetch();
} else {
instructionBuffer.pop_front();
}
} else {
// the instruction changed the PC itself: drop what was fetched
discardFetch();
}
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
// lanes enabled in the execution mask at this point, i.e. after the PC
// update above
const int num_active_lanes = execMask().count();
computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->numVecOpsExecuted += num_active_lanes;
if (isGmInstruction(ii)) {
computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
} else if (isLmInstruction(ii)) {
computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
}
}
// ---- Update Vector ALU pipeline and other resources ------------------ //
// Single precision ALU or Branch or Return or Special instruction
if (ii->isALU() || ii->isSpecialOp() ||
ii->isBranch() ||
// FIXME: Kernel argument loads are currently treated as ALU operations
// since we don't send memory packets at execution. If we fix that then
// we should map them to one of the memory pipelines
(ii->isKernArgSeg() && ii->isLoad()) ||
ii->isArgSeg() ||
ii->isReturn()) {
computeUnit->aluPipe[simdId].set(computeUnit->shader->
ticks(computeUnit->spBypassLength()));
// this is to enforce a fixed number of cycles per issue slot per SIMD
computeUnit->wfWait[simdId].set(computeUnit->shader->
ticks(computeUnit->issuePeriod));
} else if (ii->isBarrier()) {
computeUnit->wfWait[simdId].set(computeUnit->shader->
ticks(computeUnit->issuePeriod));
} else if (ii->isLoad() && ii->isFlat()) {
// flat load: bus and issue slot are chosen by the resolved segment
assert(Enums::SC_NONE != ii->executedAs());
if (Enums::SC_SHARED == ii->executedAs()) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
} else if (ii->isStore() && ii->isFlat()) {
// flat store
assert(Enums::SC_NONE != ii->executedAs());
if (Enums::SC_SHARED == ii->executedAs()) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
} else if (ii->isLoad() && ii->isGlobalMem()) {
// global load
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (ii->isStore() && ii->isGlobalMem()) {
// global store
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
// global atomic / fence
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (ii->isLoad() && ii->isLocalMem()) {
// local load
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (ii->isStore() && ii->isLocalMem()) {
// local store
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
// local atomic / fence
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
}
/**
 * @param lane lane index into barCnt
 * @return true while the lane's barrier count is below maxBarCnt
 */
bool
Wavefront::waitingAtBarrier(int lane)
{
    const bool below_max = barCnt[lane] < maxBarCnt;
    return below_max;
}
/**
 * Push a new entry on top of the reconvergence stack.
 *
 * @param pc   program counter stored in the new entry
 * @param rpc  reconvergence program counter stored in the new entry
 * @param mask execution mask stored in the new entry; at least one bit
 *             must be set (asserted)
 */
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
                                    const VectorMask& mask)
{
    assert(mask.count());

    ReconvergenceStackEntry *entry = new ReconvergenceStackEntry{pc, rpc, mask};
    reconvergenceStack.emplace_back(entry);
}
// Remove the top entry of the reconvergence stack. The stack must not be
// empty (asserted).
//
// The two WavefrontStack trace statements bracket the pop: the first prints
// the execution mask and PC of the entry about to be removed, the second
// prints the PC and execution mask of the entry that is on top afterwards.
// NOTE(review): the second DPRINTF reads pc()/execMask() after the pop, so
// it relies on at least one entry remaining - confirm callers never pop the
// last entry while this trace is active.
void
Wavefront::popFromReconvergenceStack()
{
assert(!reconvergenceStack.empty());
DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
computeUnit->cu_id, simdId, wfSlotId, wfDynId,
execMask().to_string<char, std::string::traits_type,
std::string::allocator_type>().c_str(), pc());
reconvergenceStack.pop_back();
DPRINTF(WavefrontStack, "%3i %s\n", pc(),
execMask().to_string<char, std::string::traits_type,
std::string::allocator_type>().c_str());
}
/**
 * Throw away every buffered instruction. If a fetch is still pending,
 * dropFetch is set as well; dropFetch is never cleared here.
 */
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();
    dropFetch = dropFetch | pendingFetch;
}
/**
 * @return the program counter stored in the top reconvergence stack entry
 */
uint32_t
Wavefront::pc() const
{
    const auto &top = reconvergenceStack.back();
    return top->pc;
}
/**
 * @return the reconvergence program counter stored in the top
 *         reconvergence stack entry
 */
uint32_t
Wavefront::rpc() const
{
    const auto &top = reconvergenceStack.back();
    return top->rpc;
}
/**
 * @return a copy of the execution mask stored in the top reconvergence
 *         stack entry
 */
VectorMask
Wavefront::execMask() const
{
    const auto &top = reconvergenceStack.back();
    return top->execMask;
}
/**
 * @param lane lane index into the execution mask
 * @return the mask bit for that lane in the top reconvergence stack entry
 */
bool
Wavefront::execMask(int lane) const
{
    const auto &top = reconvergenceStack.back();
    return top->execMask[lane];
}
/**
 * Overwrite the program counter of the top reconvergence stack entry.
 *
 * @param new_pc value to store
 */
void
Wavefront::pc(uint32_t new_pc)
{
    auto &top = reconvergenceStack.back();
    top->pc = new_pc;
}
/**
 * Compute the number of bytes taken by the fixed part of a context image:
 * the per-lane barrier counters, the scalar identification / barrier
 * fields, the base addresses, the ldsChunk pointer, and one reconvergence
 * stack slot per lane. Register and LDS contents are not included.
 *
 * @return the byte count described above
 */
uint32_t
Wavefront::getStaticContextSize() const
{
    // per-lane barrier counters, stored as int
    auto bytes = barCnt.size() * sizeof(int);

    // scalar bookkeeping fields
    bytes += sizeof(wfId) + sizeof(maxBarCnt);
    bytes += sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId);
    bytes += sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask);
    bytes += sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk);

    // room for a full reconvergence stack: one entry per lane
    bytes += computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);

    return bytes;
}
/**
 * Serialize this wavefront's state into the caller-provided buffer.
 *
 * Layout, in write order: per-lane barrier counters, wfId, maxBarCnt,
 * oldBarrierCnt, barrierCnt, cu_id, wgId, barrierId, initMask (as a 64-bit
 * value), privBase, spillBase, one reconvergence stack slot per lane,
 * the single precision VGPRs, the double precision VGPRs, the condition
 * registers and finally, if ldsChunk is set, its contents byte by byte.
 *
 * Note that the buffer is written through despite the const qualifier, and
 * that the reconvergence stack is emptied in the process: entries are
 * popped and written starting from the top of the stack, and the unused
 * slots are filled with an all-ones sentinel entry.
 *
 * @param out destination buffer; it has to be large enough for everything
 *            listed above (no bounds checking is done here)
 */
void
Wavefront::getContext(const void *out)
{
    uint8_t *cursor = (uint8_t *)out;

    for (int lane = 0; lane < barCnt.size(); lane++) {
        *(int *)cursor = barCnt[lane];
        cursor += sizeof(barCnt[lane]);
    }

    *(int *)cursor = wfId;
    cursor += sizeof(wfId);
    *(int *)cursor = maxBarCnt;
    cursor += sizeof(maxBarCnt);
    *(int *)cursor = oldBarrierCnt;
    cursor += sizeof(oldBarrierCnt);
    *(int *)cursor = barrierCnt;
    cursor += sizeof(barrierCnt);
    *(int *)cursor = computeUnit->cu_id;
    cursor += sizeof(computeUnit->cu_id);
    *(uint32_t *)cursor = wgId;
    cursor += sizeof(wgId);
    *(uint32_t *)cursor = barrierId;
    cursor += sizeof(barrierId);
    *(uint64_t *)cursor = initMask.to_ullong();
    cursor += sizeof(initMask.to_ullong());
    *(Addr *)cursor = privBase;
    cursor += sizeof(privBase);
    *(Addr *)cursor = spillBase;
    cursor += sizeof(spillBase);

    // reconvergence stack: live entries first (top of stack first), then
    // sentinel entries up to one slot per lane
    const int stack_depth = reconvergenceStack.size();
    ReconvergenceStackEntry sentinel = {std::numeric_limits<uint32_t>::max(),
                                        std::numeric_limits<uint32_t>::max(),
                                        std::numeric_limits<uint64_t>::max()};
    for (int slot = 0; slot < workItemId[0].size(); slot++) {
        if (slot >= stack_depth) {
            *(ReconvergenceStackEntry *)cursor = sentinel;
        } else {
            *(ReconvergenceStackEntry *)cursor = *reconvergenceStack.back();
            reconvergenceStack.pop_back();
        }
        cursor += sizeof(ReconvergenceStackEntry);
    }

    const int wf_size = computeUnit->wfSize();

    // single precision vector registers, lane by lane
    for (int reg = 0; reg < maxSpVgprs; reg++) {
        const uint32_t vgprIdx = remap(reg, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx,lane);
            *(uint32_t *)cursor = regVal;
            cursor += sizeof(regVal);
        }
    }

    // double precision vector registers, lane by lane
    for (int reg = 0; reg < maxDpVgprs; reg++) {
        const uint32_t vgprIdx = remap(reg, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx,lane);
            *(uint64_t *)cursor = regVal;
            cursor += sizeof(regVal);
        }
    }

    // condition registers, lane by lane
    for (int reg = 0; reg < condRegState->numRegs(); reg++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(reg, lane);
            *(uint64_t *)cursor = regVal;
            cursor += sizeof(regVal);
        }
    }

    /* saving LDS content */
    if (ldsChunk) {
        for (int offset = 0; offset < ldsChunk->size(); offset++) {
            char val = ldsChunk->read<char>(offset);
            *(char *) cursor = val;
            cursor += sizeof(val);
        }
    }
}
/**
 * Restore this wavefront's state from a context image produced by
 * getContext().
 *
 * The fields are read in the order getContext() writes them: per-lane
 * barrier counters, wfId, maxBarCnt, oldBarrierCnt, barrierCnt, cu_id,
 * wgId, barrierId, initMask, privBase, spillBase, the reconvergence stack
 * slots, the single precision VGPRs, the double precision VGPRs, the
 * condition registers and, if ldsChunk is set, the LDS contents.
 *
 * Restored stack entries are pushed on top of whatever the reconvergence
 * stack currently holds; the stack is not cleared first.
 *
 * @param in source buffer holding the context image (no bounds checking
 *           is done here)
 */
void
Wavefront::setContext(const void *in)
{
    uint8_t *iter = (uint8_t *)in;
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    // the mask is stored as a 64-bit integer, so advance by exactly that
    // many bytes rather than by the size of the mask object
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(uint64_t);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    // getContext() stores the stack entries starting from the top of the
    // stack. Walk the slots from last to first so that the entry that was
    // on top when the context was saved is pushed last and ends up on top
    // again; sentinel (unused) slots are skipped.
    const int num_slots = workItemId[0].size();
    uint8_t *stack_base = iter;
    for (int i = num_slots - 1; i >= 0; i--) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)
            (stack_base + i * sizeof(ReconvergenceStackEntry));
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    iter = stack_base + num_slots * sizeof(ReconvergenceStackEntry);

    int wf_size = computeUnit->wfSize();
    // single precision vector registers, lane by lane
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }
    // double precision vector registers, lane by lane
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }
    // condition registers, lane by lane
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /** Restoring LDS contents */
    if (ldsChunk) {
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
    }
}
/**
 * Compute the actual size of this wavefront's work-group in each of the
 * three dimensions, and the product of the three.
 *
 * Per dimension the result is the smaller of the nominal work-group size
 * and what is left of the grid starting at this work-group
 * (gridSz - wgId * workGroupSz).
 *
 * @param ndr range descriptor; only its wgId[] is read here
 */
void
Wavefront::computeActualWgSz(NDRange *ndr)
{
    actualWgSzTotal = 1;
    for (int dim = 0; dim < 3; ++dim) {
        const auto remaining =
            gridSz[dim] - ndr->wgId[dim] * workGroupSz[dim];
        actualWgSz[dim] = std::min(workGroupSz[dim], remaining);
        actualWgSzTotal *= actualWgSz[dim];
    }
}