gem5/src/gpu-compute/wavefront.cc

926 lines
32 KiB
C++
Raw Normal View History

2016-01-19 20:28:22 +01:00
/*
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
* All rights reserved.
*
* For use for simulation and test purposes only
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Author: Lisa Hsu
*/
#include "gpu-compute/wavefront.hh"
#include "debug/GPUExec.hh"
#include "debug/WavefrontStack.hh"
#include "gpu-compute/code_enums.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
/**
 * Factory entry point on the parameter object: allocates the Wavefront
 * described by these parameters and hands the raw pointer to the caller.
 */
Wavefront*
WavefrontParams::create()
{
    Wavefront *wavefront = new Wavefront(this);
    return wavefront;
}
/**
 * Construct a wavefront slot from its parameter object.
 *
 * The slot starts out stopped with every counter zeroed; the SIMD unit id
 * and wavefront slot id are taken from the parameters.
 *
 * @param p parameter object providing simdId and wf_slot_id
 */
Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr)
{
    // identity of this slot, taken from the configuration
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;

    // a freshly built wavefront is not running and has nothing in flight
    status = S_STOPPED;
    pendingFetch = false;
    dropFetch = false;

    // vector register allocation
    startVgprIndex = 0;
    reservedVectorRegs = 0;
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    condRegState = new ConditionRegisterState();

    // memory request counters: outstanding ...
    outstanding_reqs = 0;
    outstanding_reqs_rd_gm = 0;
    outstanding_reqs_wr_gm = 0;
    outstanding_reqs_rd_lm = 0;
    outstanding_reqs_wr_lm = 0;

    // ... and still in the pipeline
    mem_reqs_in_pipe = 0;
    rd_gm_reqs_in_pipe = 0;
    wr_gm_reqs_in_pipe = 0;
    rd_lm_reqs_in_pipe = 0;
    wr_lm_reqs_in_pipe = 0;

    // barrier state
    barrier_cnt = 0;
    old_barrier_cnt = 0;
    stalledAtBarrier = false;

    // tracing state
    last_trace = 0;
    mem_trace_busy = 0;
    old_vgpr_tcnt = 0xffffffffffffffffll;
    old_dgpr_tcnt = 0xffffffffffffffffll;
}
/**
 * Register this wavefront's statistics: the source/destination register
 * operand distributions and the three "times blocked" counters.
 */
void
Wavefront::regStats()
{
    // every stat is published under this object's name
    const std::string stat_prefix = name();

    srcRegOpDist
        .init(0, 4, 2)
        .name(stat_prefix + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    dstRegOpDist
        .init(0, 3, 2)
        .name(stat_prefix + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
              "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(stat_prefix + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
              "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(stat_prefix + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
              "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(stat_prefix + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
              "availability")
        ;
}
/**
 * Reset the vector register allocation of this wavefront: no registers
 * reserved, allocation window starting at index 0.
 */
void
Wavefront::init()
{
    startVgprIndex = 0;
    reservedVectorRegs = 0;
}
/**
 * Record the register requirements for this wavefront.
 *
 * @param num_cregs count forwarded to the condition register state's init()
 * @param num_sregs stored as maxSpVgprs
 * @param num_dregs stored as maxDpVgprs
 */
void
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
{
    maxSpVgprs = num_sregs;
    maxDpVgprs = num_dregs;
    condRegState->init(num_cregs);
}
/**
 * Release the heap objects owned by this wavefront.
 */
Wavefront::~Wavefront()
{
    // deleting a null pointer is a no-op, so no guard is required
    delete callArgMem;
    // the condition register state is allocated with new in the
    // constructor and was previously never released
    delete condRegState;
}
/**
 * Mark this wavefront as running.
 *
 * @param _wfDynId  value stored as the wavefront's dynamic id
 * @param _base_ptr value stored as the wavefront's base pointer
 */
void
Wavefront::start(uint64_t _wfDynId,uint64_t _base_ptr)
{
    status = S_RUNNING;
    base_ptr = _base_ptr;
    wfDynId = _wfDynId;
}
/**
 * Classify an instruction as a "global memory" instruction.
 *
 * @param ii instruction to classify
 * @return true for private-memory, global-memory and flat reads, writes
 *         and atomics; false for everything else
 */
bool
Wavefront::isGmInstruction(GPUDynInstPtr ii)
{
    return IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
           IS_OT_ATOMIC_PM(ii->opType()) ||
           IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
           IS_OT_ATOMIC_GM(ii->opType()) ||
           IS_OT_FLAT(ii->opType());
}
/**
 * Classify an instruction as a local-memory instruction.
 *
 * @param ii instruction to classify
 * @return true for local-memory reads, writes and atomics
 */
bool
Wavefront::isLmInstruction(GPUDynInstPtr ii)
{
    return IS_OT_READ_LM(ii->opType()) || IS_OT_WRITE_LM(ii->opType()) ||
           IS_OT_ATOMIC_LM(ii->opType());
}
bool
Wavefront::isOldestInstALU()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && (ii->opType() == Enums::OT_NOP ||
ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
ii->opType() == Enums::OT_KERN_READ)) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstBarrier()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && ii->opType() == Enums::OT_BARRIER) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstGMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && (IS_OT_READ_GM(ii->opType()) ||
IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstLMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && (IS_OT_READ_LM(ii->opType()) ||
IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstPrivMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && (IS_OT_READ_PM(ii->opType()) ||
IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()))) {
return true;
}
return false;
}
bool
Wavefront::isOldestInstFlatMem()
{
assert(!instructionBuffer.empty());
GPUDynInstPtr ii = instructionBuffer.front();
if (status != S_STOPPED && IS_OT_FLAT(ii->opType())) {
return true;
}
return false;
}
/**
 * Scan the instruction buffer for a control-flow instruction.
 *
 * @return true if any buffered instruction is a return (OT_RET) or a
 *         branch (OT_BRANCH); false otherwise, including when the buffer
 *         is empty
 */
bool
Wavefront::instructionBufferHasBranch()
{
    // iterate by const reference: the buffered handles only need to be
    // inspected, so there is no reason to copy each one (twice) per step
    for (const auto &inst : instructionBuffer) {
        if (inst->opType() == Enums::OT_RET ||
            inst->opType() == Enums::OT_BRANCH) {
            return true;
        }
    }

    return false;
}
/**
 * Remap an HSAIL register to a physical VGPR.
 * HSAIL register = virtual register assigned to an operand by HLC compiler
 *
 * @param vgprIndex virtual register index; must be below reservedVectorRegs
 * @param size      operand size; values above 4 select the double
 *                  precision mapping when mode is 1
 * @param mode      1 selects the HSAIL mapping with separate SP/DP spaces
 * @return physical register index, wrapped to the register count of this
 *         wavefront's SIMD vector register file
 */
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));

    uint32_t physicalVgprIndex;
    if (mode == 1 && size > 4) {
        // HSAIL double precision (DP) register: DP registers are assumed
        // to be placed after the SP ones in the VRF. The DP and SP VGPR
        // name spaces are separate in HSAIL mode, so the DP index is
        // adjusted (two slots per register, past the SP block) before it
        // is mapped into the physical VRF address space.
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    } else {
        // plain case: offset from where this wavefront's VGPRs start
        physicalVgprIndex = startVgprIndex + vgprIndex;
    }

    // the result has to stay inside this wavefront's reserved window
    assert((physicalVgprIndex >= startVgprIndex) &&
           physicalVgprIndex <= (startVgprIndex + reservedVectorRegs - 1));

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}
/**
 * Check whether this wavefront can issue its oldest buffered instruction
 * to the execution resource class given by @p type.
 *
 * As a side effect, a wavefront that was stalled at a barrier is released
 * (stalledAtBarrier cleared, old_barrier_cnt updated) once
 * ComputeUnit::AllAtBarrier reports success.
 *
 * @param type resource class being scheduled (I_ALU, I_GLOBAL, I_SHARED,
 *             I_PRIVATE or I_FLAT)
 * @return 1 if the oldest instruction belongs to that class and all the
 *         checks for it pass, 0 otherwise
 */
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrier_id,barrier_cnt,
                computeUnit->getRefCounter(dispatchid, wg_id))) {
            // Are all threads at barrier?
            return 0;
        }
        old_barrier_cnt = barrier_cnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;

    // NOTE(review): both probe loops below index wfWait with the loop
    // counter j, whereas updateResources()/exec() reserve the memory issue
    // slots at wfWait[GlbMemUnitId()] / wfWait[ShrMemUnitId()] - confirm
    // that these refer to the same entries.
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }

    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    // Flat instructions are checked against the local memory resources as
    // well (see the I_FLAT branch below), so the local units have to be
    // probed for I_FLAT too; otherwise both flags stay false and a flat
    // instruction can never be reported ready.
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // Common VRF check used by every non-trivial branch below: can we
    // schedule source & destination operands on the VRF, and are all the
    // operands ready? (RAW, WAW and WAR dependencies met?)
    auto vrfReady = [this, &ii]() -> bool {
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return false;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return false;
        }
        return true;
    };

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->opType() == Enums::OT_BARRIER || ii->opType() == Enums::OT_NOP ||
          ii->opType() == Enums::OT_RET || ii->opType() == Enums::OT_BRANCH ||
          ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
          ii->opType() == Enums::OT_KERN_READ ||
          ii->opType() == Enums::OT_ARG ||
          IS_OT_READ_GM(ii->opType()) || IS_OT_WRITE_GM(ii->opType()) ||
          IS_OT_ATOMIC_GM(ii->opType()) || IS_OT_READ_LM(ii->opType()) ||
          IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
          IS_OT_READ_PM(ii->opType()) || IS_OT_WRITE_PM(ii->opType()) ||
          IS_OT_ATOMIC_PM(ii->opType()) || IS_OT_FLAT(ii->opType()))) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->opType() == Enums::OT_BARRIER) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_NOP) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->opType() == Enums::OT_RET) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstanding_reqs + mem_reqs_in_pipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->opType() == Enums::OT_BRANCH ||
               ii->opType() == Enums::OT_ALU || IS_OT_LDAS(ii->opType()) ||
               ii->opType() == Enums::OT_KERN_READ ||
               ii->opType() == Enums::OT_ARG)) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        if (!vrfReady()) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_GLOBAL && (IS_OT_READ_GM(ii->opType()) ||
               IS_OT_WRITE_GM(ii->opType()) ||
               IS_OT_ATOMIC_GM(ii->opType()))) {
        // Here Global memory instruction
        if (IS_OT_READ_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_GM(ii->opType()) || IS_OT_ATOMIC_GM(ii->opType()) ||
            IS_OT_HIST_GM(ii->opType())) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!vrfReady()) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_SHARED && (IS_OT_READ_LM(ii->opType()) ||
               IS_OT_WRITE_LM(ii->opType()) ||
               IS_OT_ATOMIC_LM(ii->opType()))) {
        // Here for Shared memory instruction
        if (IS_OT_READ_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType())) {
            // Are there in pipe or outstanding local memory write requests?
            if ((outstanding_reqs_wr_lm + wr_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_LM(ii->opType()) || IS_OT_ATOMIC_LM(ii->opType()) ||
            IS_OT_HIST_LM(ii->opType())) {
            // Are there in pipe or outstanding local memory read requests?
            if ((outstanding_reqs_rd_lm + rd_lm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }

        if (!vrfReady()) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_PRIVATE && (IS_OT_READ_PM(ii->opType()) ||
               IS_OT_WRITE_PM(ii->opType()) ||
               IS_OT_ATOMIC_PM(ii->opType()))) {
        // Here for Private memory instruction; these are checked against
        // the global memory counters and resources
        if (IS_OT_READ_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType())) {
            if ((outstanding_reqs_wr_gm + wr_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (IS_OT_WRITE_PM(ii->opType()) || IS_OT_ATOMIC_PM(ii->opType()) ||
            IS_OT_HIST_PM(ii->opType())) {
            if ((outstanding_reqs_rd_gm + rd_gm_reqs_in_pipe) > 0) {
                return 0;
            }
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!vrfReady()) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_FLAT && IS_OT_FLAT(ii->opType())) {
        // Here for Flat memory instruction: both the global and the local
        // memory resources have to be available
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rd_gm_reqs_in_pipe + wr_gm_reqs_in_pipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rd_lm_reqs_in_pipe + wr_lm_reqs_in_pipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }

        if (!vrfReady()) {
            return 0;
        }

        ready_inst = true;
    } else {
        // the oldest instruction does not belong to the requested class
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());

    return 1;
}
void
Wavefront::updateResources()
{
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
assert(ii);
computeUnit->vrf[simdId]->updateResources(this, ii);
// Single precision ALU or Branch or Return or Special instruction
if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
// FIXME: Kernel argument loads are currently treated as ALU operations
// since we don't send memory packets at execution. If we fix that then
// we should map them to one of the memory pipelines
ii->opType()==Enums::OT_KERN_READ ||
ii->opType()==Enums::OT_ARG ||
ii->opType()==Enums::OT_RET) {
computeUnit->aluPipe[simdId].preset(computeUnit->shader->
ticks(computeUnit->spBypassLength()));
// this is to enforce a fixed number of cycles per issue slot per SIMD
computeUnit->wfWait[simdId].preset(computeUnit->shader->
ticks(computeUnit->issuePeriod));
} else if (ii->opType() == Enums::OT_BARRIER) {
computeUnit->wfWait[simdId].preset(computeUnit->shader->
ticks(computeUnit->issuePeriod));
} else if (ii->opType() == Enums::OT_FLAT_READ) {
assert(Enums::SC_NONE != ii->executedAs());
mem_reqs_in_pipe++;
rd_gm_reqs_in_pipe++;
if ( Enums::SC_SHARED == ii->executedAs() ) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
} else if (ii->opType() == Enums::OT_FLAT_WRITE) {
assert(Enums::SC_NONE != ii->executedAs());
mem_reqs_in_pipe++;
wr_gm_reqs_in_pipe++;
if (Enums::SC_SHARED == ii->executedAs()) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
} else if (IS_OT_READ_GM(ii->opType())) {
mem_reqs_in_pipe++;
rd_gm_reqs_in_pipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_WRITE_GM(ii->opType())) {
mem_reqs_in_pipe++;
wr_gm_reqs_in_pipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_ATOMIC_GM(ii->opType())) {
mem_reqs_in_pipe++;
wr_gm_reqs_in_pipe++;
rd_gm_reqs_in_pipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_READ_LM(ii->opType())) {
mem_reqs_in_pipe++;
rd_lm_reqs_in_pipe++;
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_WRITE_LM(ii->opType())) {
mem_reqs_in_pipe++;
wr_lm_reqs_in_pipe++;
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_ATOMIC_LM(ii->opType())) {
mem_reqs_in_pipe++;
wr_lm_reqs_in_pipe++;
rd_lm_reqs_in_pipe++;
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_READ_PM(ii->opType())) {
mem_reqs_in_pipe++;
rd_gm_reqs_in_pipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_WRITE_PM(ii->opType())) {
mem_reqs_in_pipe++;
wr_gm_reqs_in_pipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_ATOMIC_PM(ii->opType())) {
mem_reqs_in_pipe++;
wr_gm_reqs_in_pipe++;
rd_gm_reqs_in_pipe++;
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
preset(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
}
void
Wavefront::exec()
{
// ---- Exit if wavefront is inactive ----------------------------- //
if (status == S_STOPPED || status == S_RETURNING ||
instructionBuffer.empty()) {
return;
}
// Get current instruction
GPUDynInstPtr ii = instructionBuffer.front();
const uint32_t old_pc = pc();
DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
"(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
ii->disassemble(), old_pc);
ii->execute();
// access the VRF
computeUnit->vrf[simdId]->exec(ii, this);
srcRegOpDist.sample(ii->numSrcRegOperands());
dstRegOpDist.sample(ii->numDstRegOperands());
computeUnit->numInstrExecuted++;
computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
computeUnit->lastExecCycle[simdId]);
computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
if (pc() == old_pc) {
uint32_t new_pc = old_pc + 1;
// PC not modified by instruction, proceed to next or pop frame
pc(new_pc);
if (new_pc == rpc()) {
popFromReconvergenceStack();
discardFetch();
} else {
instructionBuffer.pop_front();
}
}
if (computeUnit->shader->hsail_mode==Shader::SIMT) {
const int num_active_lanes = execMask().count();
computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
computeUnit->numVecOpsExecuted += num_active_lanes;
if (isGmInstruction(ii)) {
computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
} else if (isLmInstruction(ii)) {
computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
}
}
// ---- Update Vector ALU pipeline and other resources ------------------ //
// Single precision ALU or Branch or Return or Special instruction
if (ii->opType() == Enums::OT_ALU || ii->opType() == Enums::OT_SPECIAL ||
ii->opType() == Enums::OT_BRANCH || IS_OT_LDAS(ii->opType()) ||
// FIXME: Kernel argument loads are currently treated as ALU operations
// since we don't send memory packets at execution. If we fix that then
// we should map them to one of the memory pipelines
ii->opType() == Enums::OT_KERN_READ ||
ii->opType() == Enums::OT_ARG ||
ii->opType() == Enums::OT_RET) {
computeUnit->aluPipe[simdId].set(computeUnit->shader->
ticks(computeUnit->spBypassLength()));
// this is to enforce a fixed number of cycles per issue slot per SIMD
computeUnit->wfWait[simdId].set(computeUnit->shader->
ticks(computeUnit->issuePeriod));
} else if (ii->opType() == Enums::OT_BARRIER) {
computeUnit->wfWait[simdId].set(computeUnit->shader->
ticks(computeUnit->issuePeriod));
} else if (ii->opType() == Enums::OT_FLAT_READ) {
assert(Enums::SC_NONE != ii->executedAs());
if (Enums::SC_SHARED == ii->executedAs()) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
} else if (ii->opType() == Enums::OT_FLAT_WRITE) {
assert(Enums::SC_NONE != ii->executedAs());
if (Enums::SC_SHARED == ii->executedAs()) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
} else if (IS_OT_READ_GM(ii->opType())) {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_WRITE_GM(ii->opType())) {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_ATOMIC_GM(ii->opType())) {
computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->GlbMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_READ_LM(ii->opType())) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(4));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_WRITE_LM(ii->opType())) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
} else if (IS_OT_ATOMIC_LM(ii->opType())) {
computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
set(computeUnit->shader->ticks(8));
computeUnit->wfWait[computeUnit->ShrMemUnitId()].
set(computeUnit->shader->ticks(computeUnit->issuePeriod));
}
}
/**
 * Report whether a lane's barrier counter is still below the maximum.
 *
 * @param lane index into bar_cnt
 * @return true if bar_cnt[lane] is less than max_bar_cnt
 */
bool
Wavefront::waitingAtBarrier(int lane)
{
    const bool behind_max = bar_cnt[lane] < max_bar_cnt;
    return behind_max;
}
void
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
const VectorMask& mask)
{
assert(mask.count());
reconvergenceStack.emplace(new ReconvergenceStackEntry(pc, rpc, mask));
}
/**
 * Pop the top entry of the reconvergence stack, which must not be empty.
 *
 * With the WavefrontStack debug flag, the execution mask and PC are traced
 * both before and after the pop.
 */
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    // state of the entry that is about to be removed
    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
                                 std::string::allocator_type>().c_str(),
            pc());

    reconvergenceStack.pop();

    // state of the entry that is now on top
    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
                                 std::string::allocator_type>().c_str());
}
/**
 * Throw away all buffered instructions. If a fetch is currently pending,
 * dropFetch is raised as well; an already raised dropFetch is left as is.
 */
void
Wavefront::discardFetch()
{
    instructionBuffer.clear();

    if (pendingFetch) {
        dropFetch = true;
    }
}
/**
 * @return the pc field of the entry on top of the reconvergence stack
 */
uint32_t
Wavefront::pc() const
{
    const auto &top_entry = reconvergenceStack.top();
    return top_entry->pc;
}
/**
 * @return the rpc field of the entry on top of the reconvergence stack
 */
uint32_t
Wavefront::rpc() const
{
    const auto &top_entry = reconvergenceStack.top();
    return top_entry->rpc;
}
/**
 * @return a copy of the execution mask held by the entry on top of the
 *         reconvergence stack
 */
VectorMask
Wavefront::execMask() const
{
    const auto &top_entry = reconvergenceStack.top();
    return top_entry->execMask;
}
/**
 * @param lane index into the execution mask
 * @return the execution mask bit for that lane, read from the entry on top
 *         of the reconvergence stack
 */
bool
Wavefront::execMask(int lane) const
{
    const auto &top_entry = reconvergenceStack.top();
    return top_entry->execMask[lane];
}
/**
 * Overwrite the pc field of the entry on top of the reconvergence stack.
 *
 * @param new_pc value to store
 */
void
Wavefront::pc(uint32_t new_pc)
{
    auto &top_entry = reconvergenceStack.top();
    top_entry->pc = new_pc;
}