2016-01-19 20:28:22 +01:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* For use for simulation and test purposes only
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
|
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer.
|
|
|
|
*
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
|
|
* and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* Author: Lisa Hsu
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "gpu-compute/wavefront.hh"
|
|
|
|
|
|
|
|
#include "debug/GPUExec.hh"
|
|
|
|
#include "debug/WavefrontStack.hh"
|
|
|
|
#include "gpu-compute/compute_unit.hh"
|
|
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
|
|
#include "gpu-compute/shader.hh"
|
|
|
|
#include "gpu-compute/vector_register_file.hh"
|
|
|
|
|
|
|
|
Wavefront*
WavefrontParams::create()
{
    // Factory hook invoked by the parameter/config system to construct
    // a Wavefront from this parameter object.
    Wavefront *wf = new Wavefront(this);
    return wf;
}
|
|
|
|
|
|
|
|
// Construct a wavefront in the stopped state with all request counters,
// barrier bookkeeping, and fetch state cleared. Per-lane bookkeeping
// vectors are sized to the configured wavefront width (p->wfSize).
Wavefront::Wavefront(const Params *p)
    : SimObject(p), callArgMem(nullptr), _gpuISA(*this)
{
    lastTrace = 0;
    // identity of this wavefront: which SIMD unit and which WF slot
    simdId = p->simdId;
    wfSlotId = p->wf_slot_id;
    status = S_STOPPED;
    // no VGPRs reserved until the wave is dispatched (see init())
    reservedVectorRegs = 0;
    startVgprIndex = 0;
    // in-flight / in-pipe memory request counters, all zero at reset
    outstandingReqs = 0;
    memReqsInPipe = 0;
    outstandingReqsWrGm = 0;
    outstandingReqsWrLm = 0;
    outstandingReqsRdGm = 0;
    outstandingReqsRdLm = 0;
    rdLmReqsInPipe = 0;
    rdGmReqsInPipe = 0;
    wrLmReqsInPipe = 0;
    wrGmReqsInPipe = 0;

    // barrier bookkeeping: not currently stalled at any barrier
    barrierCnt = 0;
    oldBarrierCnt = 0;
    stalledAtBarrier = false;

    memTraceBusy = 0;
    // sentinel "never written" timestamps for the old VGPR/DGPR traces
    oldVgprTcnt = 0xffffffffffffffffll;
    oldDgprTcnt = 0xffffffffffffffffll;
    oldVgpr.resize(p->wfSize);

    // fetch state: nothing pending, nothing to drop
    pendingFetch = false;
    dropFetch = false;
    condRegState = new ConditionRegisterState();
    // register-file limits are set later via resizeRegFiles()
    maxSpVgprs = 0;
    maxDpVgprs = 0;
    // per-lane state, one entry per work item in the wavefront
    lastAddr.resize(p->wfSize);
    workItemFlatId.resize(p->wfSize);
    oldDgpr.resize(p->wfSize);
    barCnt.resize(p->wfSize);
    // 3-D work-item IDs (x, y, z), one vector per dimension
    for (int i = 0; i < 3; ++i) {
        workItemId[i].resize(p->wfSize);
    }
}
|
|
|
|
|
|
|
|
// Register this wavefront's statistics with the stats framework.
// Called once during simulation setup, after construction.
void
Wavefront::regStats()
{
    SimObject::regStats();

    // histogram of source register operand counts per executed instruction
    srcRegOpDist
        .init(0, 4, 2)
        .name(name() + ".src_reg_operand_dist")
        .desc("number of executed instructions with N source register operands")
        ;

    // histogram of destination register operand counts per executed
    // instruction
    dstRegOpDist
        .init(0, 3, 2)
        .name(name() + ".dst_reg_operand_dist")
        .desc("number of executed instructions with N destination register "
            "operands")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueWAXDependencies
        .name(name() + ".timesBlockedDueWAXDependencies")
        .desc("number of times the wf's instructions are blocked due to WAW "
            "or WAR dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueRAWDependencies
        .name(name() + ".timesBlockedDueRAWDependencies")
        .desc("number of times the wf's instructions are blocked due to RAW "
            "dependencies")
        ;

    // FIXME: the name of the WF needs to be unique
    numTimesBlockedDueVrfPortAvail
        .name(name() + ".timesBlockedDueVrfPortAvail")
        .desc("number of times instructions are blocked due to VRF port "
            "availability")
        ;
}
|
|
|
|
|
|
|
|
void
|
|
|
|
Wavefront::init()
|
|
|
|
{
|
|
|
|
reservedVectorRegs = 0;
|
|
|
|
startVgprIndex = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
Wavefront::resizeRegFiles(int num_cregs, int num_sregs, int num_dregs)
|
|
|
|
{
|
|
|
|
condRegState->init(num_cregs);
|
|
|
|
maxSpVgprs = num_sregs;
|
|
|
|
maxDpVgprs = num_dregs;
|
|
|
|
}
|
|
|
|
|
|
|
|
Wavefront::~Wavefront()
{
    // Release owned resources. deleting a null pointer is a no-op, so
    // callArgMem needs no guard.
    delete callArgMem;
    delete condRegState;
}
|
|
|
|
|
|
|
|
void
|
2016-09-16 18:27:56 +02:00
|
|
|
Wavefront::start(uint64_t _wf_dyn_id,uint64_t _base_ptr)
|
2016-01-19 20:28:22 +01:00
|
|
|
{
|
2016-09-16 18:27:56 +02:00
|
|
|
wfDynId = _wf_dyn_id;
|
2016-09-16 18:26:52 +02:00
|
|
|
basePtr = _base_ptr;
|
2016-01-19 20:28:22 +01:00
|
|
|
status = S_RUNNING;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::isGmInstruction(GPUDynInstPtr ii)
|
|
|
|
{
|
2016-10-27 04:47:11 +02:00
|
|
|
if (ii->isGlobalMem() || ii->isFlat())
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::isLmInstruction(GPUDynInstPtr ii)
|
|
|
|
{
|
2016-10-27 04:47:11 +02:00
|
|
|
if (ii->isLocalMem()) {
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::isOldestInstALU()
|
|
|
|
{
|
|
|
|
assert(!instructionBuffer.empty());
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
|
2016-10-27 04:47:11 +02:00
|
|
|
if (status != S_STOPPED && (ii->isNop() ||
|
|
|
|
ii->isReturn() || ii->isBranch() ||
|
|
|
|
ii->isALU() || (ii->isKernArgSeg() && ii->isLoad()))) {
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::isOldestInstBarrier()
|
|
|
|
{
|
|
|
|
assert(!instructionBuffer.empty());
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
|
2016-10-27 04:47:11 +02:00
|
|
|
if (status != S_STOPPED && ii->isBarrier()) {
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::isOldestInstGMem()
|
|
|
|
{
|
|
|
|
assert(!instructionBuffer.empty());
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
|
2016-10-27 04:47:11 +02:00
|
|
|
if (status != S_STOPPED && ii->isGlobalMem()) {
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::isOldestInstLMem()
|
|
|
|
{
|
|
|
|
assert(!instructionBuffer.empty());
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
|
2016-10-27 04:47:11 +02:00
|
|
|
if (status != S_STOPPED && ii->isLocalMem()) {
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::isOldestInstPrivMem()
|
|
|
|
{
|
|
|
|
assert(!instructionBuffer.empty());
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
|
2016-10-27 04:47:11 +02:00
|
|
|
if (status != S_STOPPED && ii->isPrivateSeg()) {
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::isOldestInstFlatMem()
|
|
|
|
{
|
|
|
|
assert(!instructionBuffer.empty());
|
|
|
|
GPUDynInstPtr ii = instructionBuffer.front();
|
|
|
|
|
2016-10-27 04:47:11 +02:00
|
|
|
if (status != S_STOPPED && ii->isFlat()) {
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return true if the Wavefront's instruction
|
|
|
|
// buffer has branch instruction.
|
|
|
|
bool
|
|
|
|
Wavefront::instructionBufferHasBranch()
|
|
|
|
{
|
|
|
|
for (auto it : instructionBuffer) {
|
|
|
|
GPUDynInstPtr ii = it;
|
|
|
|
|
2016-10-27 04:47:11 +02:00
|
|
|
if (ii->isReturn() || ii->isBranch()) {
|
2016-01-19 20:28:22 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remap HSAIL register to physical VGPR.
// HSAIL register = virtual register assigned to an operand by HLC compiler
//
// vgprIndex: virtual VGPR number within this wave's name space
// size:      operand size in bytes; > 4 selects the double-precision
//            mapping (NOTE(review): assumed from the `size > 4` test --
//            confirm units against callers)
// mode:      1 selects the HSAIL DP remapping path; other values use the
//            plain SP offset
// returns:   absolute physical VGPR index within the SIMD's VRF
uint32_t
Wavefront::remap(uint32_t vgprIndex, uint32_t size, uint8_t mode)
{
    // the wave must have reserved registers and the index must be in range
    assert((vgprIndex < reservedVectorRegs) && (reservedVectorRegs > 0));
    // add the offset from where the VGPRs of the wavefront have been assigned
    uint32_t physicalVgprIndex = startVgprIndex + vgprIndex;
    // HSAIL double precision (DP) register: calculate the physical VGPR index
    // assuming that DP registers are placed after SP ones in the VRF. The DP
    // and SP VGPR name spaces in HSAIL mode are separate so we need to adjust
    // the DP VGPR index before mapping it to the physical VRF address space
    if (mode == 1 && size > 4) {
        // each DP register occupies two physical SP slots, hence 2 * index
        physicalVgprIndex = startVgprIndex + maxSpVgprs + (2 * vgprIndex);
    }

    // the remapped index must fall inside this wave's reserved range
    assert((startVgprIndex <= physicalVgprIndex) &&
        (startVgprIndex + reservedVectorRegs - 1) >= physicalVgprIndex);

    // calculate absolute physical VGPR index
    return physicalVgprIndex % computeUnit->vrf[simdId]->numRegs();
}
|
|
|
|
|
|
|
|
// Return true if this wavefront is ready
// to execute an instruction of the specified type.
//
// Returns 1 when the oldest buffered instruction matches `type` and all
// of the structural resources it needs (issue slot, VRF ports/operands,
// pipe buses, request FIFOs) are available this cycle; 0 otherwise.
// As a side effect, a wave stalled at a barrier is released here once
// all work items of the work-group have arrived.
int
Wavefront::ready(itype_e type)
{
    // Check to make sure wave is running
    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return 0;
    }

    // Is the wave waiting at a barrier
    if (stalledAtBarrier) {
        if (!computeUnit->AllAtBarrier(barrierId,barrierCnt,
                        computeUnit->getRefCounter(dispatchId, wgId))) {
            // Are all threads at barrier?
            return 0;
        }
        // all participants have arrived: retire this barrier instance and
        // let the wave proceed
        oldBarrierCnt = barrierCnt;
        stalledAtBarrier = false;
    }

    // Read instruction
    GPUDynInstPtr ii = instructionBuffer.front();

    bool ready_inst M5_VAR_USED = false;
    // scan the global-memory pipe buses and issue slots; one free unit of
    // each kind is enough
    bool glbMemBusRdy = false;
    bool glbMemIssueRdy = false;
    if (type == I_GLOBAL || type == I_FLAT || type == I_PRIVATE) {
        for (int j=0; j < computeUnit->numGlbMemUnits; ++j) {
            if (computeUnit->vrfToGlobalMemPipeBus[j].prerdy())
                glbMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                glbMemIssueRdy = true;
        }
    }
    // same scan for the local-memory (LDS) pipes
    bool locMemBusRdy = false;
    bool locMemIssueRdy = false;
    if (type == I_SHARED || type == I_FLAT) {
        for (int j=0; j < computeUnit->numLocMemUnits; ++j) {
            if (computeUnit->vrfToLocalMemPipeBus[j].prerdy())
                locMemBusRdy = true;
            if (computeUnit->wfWait[j].prerdy())
                locMemIssueRdy = true;
        }
    }

    // The following code is very error prone and the entire process for
    // checking readiness will be fixed eventually. In the meantime, let's
    // make sure that we do not silently let an instruction type slip
    // through this logic and always return not ready.
    if (!(ii->isBarrier() || ii->isNop() || ii->isReturn() || ii->isBranch() ||
        ii->isALU() || ii->isLoad() || ii->isStore() || ii->isAtomic() ||
        ii->isMemFence() || ii->isFlat())) {
        panic("next instruction: %s is of unknown type\n", ii->disassemble());
    }

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Checking Read for Inst : %s\n",
            computeUnit->cu_id, simdId, wfSlotId, ii->disassemble());

    if (type == I_ALU && ii->isBarrier()) {
        // Here for ALU instruction (barrier)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isNop()) {
        // Here for ALU instruction (nop)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && ii->isReturn()) {
        // Here for ALU instruction (return)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is wave slot free?
            return 0;
        }

        // Are there in pipe or outstanding memory requests?
        if ((outstandingReqs + memReqsInPipe) > 0) {
            return 0;
        }

        ready_inst = true;
    } else if (type == I_ALU && (ii->isBranch() ||
               ii->isALU() ||
               (ii->isKernArgSeg() && ii->isLoad()) ||
               ii->isArgSeg())) {
        // Here for ALU instruction (all others)
        if (!computeUnit->wfWait[simdId].prerdy()) {
            // Is alu slot free?
            return 0;
        }
        // can the VRF service this instruction's operand accesses?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }

        // are all register operands free of pending dependencies?
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_GLOBAL && ii->isGlobalMem()) {
        // Here Global memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory write requests?
            if ((outstandingReqsWrGm + wrGmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // Are there in pipe or outstanding global memory read requests?
            if ((outstandingReqsRdGm + rdGmReqsInPipe) > 0)
                return 0;
        }

        if (!glbMemIssueRdy) {
            // Is WV issue slot free?
            return 0;
        }

        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_SHARED && ii->isLocalMem()) {
        // Here for Shared memory instruction
        if (ii->isLoad() || ii->isAtomic() || ii->isMemFence()) {
            // pending LDS writes must drain before a read-class access
            if ((outstandingReqsWrLm + wrLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (ii->isStore() || ii->isAtomic() || ii->isMemFence()) {
            // pending LDS reads must drain before a write-class access
            if ((outstandingReqsRdLm + rdLmReqsInPipe) > 0) {
                return 0;
            }
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }
        if (!locMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else if (type == I_FLAT && ii->isFlat()) {
        // a flat access may resolve to either memory space, so both the
        // global and the local resources must be available
        if (!glbMemBusRdy) {
            // Is there an available VRF->Global memory read bus?
            return 0;
        }

        if (!locMemBusRdy) {
            // Is there an available VRF->LDS read bus?
            return 0;
        }

        if (!glbMemIssueRdy) {
            // Is wave slot free?
            return 0;
        }

        if (!locMemIssueRdy) {
            return 0;
        }
        if (!computeUnit->globalMemoryPipe.
            isGMReqFIFOWrRdy(rdGmReqsInPipe + wrGmReqsInPipe)) {
            // Can we insert a new request to the Global Mem Request FIFO?
            return 0;
        }

        if (!computeUnit->localMemoryPipe.
            isLMReqFIFOWrRdy(rdLmReqsInPipe + wrLmReqsInPipe)) {
            // Can we insert a new request to the LDS Request FIFO?
            return 0;
        }
        // can we schedule source & destination operands on the VRF?
        if (!computeUnit->vrf[simdId]->vrfOperandAccessReady(this, ii,
                    VrfAccessType::RD_WR)) {
            return 0;
        }
        // are all the operands ready? (RAW, WAW and WAR depedencies met?)
        if (!computeUnit->vrf[simdId]->operandsReady(this, ii)) {
            return 0;
        }
        ready_inst = true;
    } else {
        // oldest instruction does not match the requested type
        return 0;
    }

    assert(ready_inst);

    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: Ready Inst : %s\n", computeUnit->cu_id,
            simdId, wfSlotId, ii->disassemble());
    return 1;
}
|
|
|
|
|
|
|
|
// Reserve (pre-set) the execution resources the oldest buffered
// instruction will need -- ALU pipe, issue slot, VRF->memory buses --
// and bump the in-pipe request counters for memory instructions.
// Mirrors the timing applied later in exec(), but uses preset() so the
// reservation is provisional.
void
Wavefront::updateResources()
{
    // Get current instruction
    GPUDynInstPtr ii = instructionBuffer.front();
    assert(ii);
    computeUnit->vrf[simdId]->updateResources(this, ii);
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) || ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].preset(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));
        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].preset(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        // flat loads must already have been resolved to a memory space
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        rdGmReqsInPipe++;
        if ( Enums::SC_SHARED == ii->executedAs() ) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        memReqsInPipe++;
        wrGmReqsInPipe++;
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                preset(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        memReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        memReqsInPipe++;
        wrGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        // atomics/fences count as both a read and a write
        memReqsInPipe++;
        wrGmReqsInPipe++;
        rdGmReqsInPipe++;
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        memReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        memReqsInPipe++;
        wrLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        // LDS atomics/fences also count as both a read and a write
        memReqsInPipe++;
        wrLmReqsInPipe++;
        rdLmReqsInPipe++;
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            preset(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            preset(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
|
|
|
|
|
|
|
|
// Execute the oldest buffered instruction: run its functional semantics,
// update CU/WF statistics, advance the PC (popping a reconvergence frame
// when reaching the reconvergence point), and commit the pipeline/bus
// timing reservations made earlier in updateResources().
void
Wavefront::exec()
{
    // ---- Exit if wavefront is inactive ----------------------------- //

    if (status == S_STOPPED || status == S_RETURNING ||
        instructionBuffer.empty()) {
        return;
    }

    // Get current instruction

    GPUDynInstPtr ii = instructionBuffer.front();

    // remember the PC before execution so we can detect control flow
    const uint32_t old_pc = pc();
    DPRINTF(GPUExec, "CU%d: WF[%d][%d]: wave[%d] Executing inst: %s "
            "(pc: %i)\n", computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            ii->disassemble(), old_pc);

    // update the instruction stats in the CU

    ii->execute(ii);
    computeUnit->updateInstStats(ii);
    // access the VRF
    computeUnit->vrf[simdId]->exec(ii, this);
    srcRegOpDist.sample(ii->numSrcRegOperands());
    dstRegOpDist.sample(ii->numDstRegOperands());
    computeUnit->numInstrExecuted++;
    computeUnit->execRateDist.sample(computeUnit->totalCycles.value() -
                                     computeUnit->lastExecCycle[simdId]);
    computeUnit->lastExecCycle[simdId] = computeUnit->totalCycles.value();
    if (pc() == old_pc) {
        uint32_t new_pc = _gpuISA.advancePC(old_pc, ii);
        // PC not modified by instruction, proceed to next or pop frame
        pc(new_pc);
        if (new_pc == rpc()) {
            // reached the reconvergence point: restore the parent frame and
            // flush the (now stale) fetched instructions
            popFromReconvergenceStack();
            discardFetch();
        } else {
            instructionBuffer.pop_front();
        }
    } else {
        // instruction redirected the PC: buffered instructions are stale
        discardFetch();
    }

    if (computeUnit->shader->hsail_mode==Shader::SIMT) {
        // per-lane activity statistics
        const int num_active_lanes = execMask().count();
        computeUnit->controlFlowDivergenceDist.sample(num_active_lanes);
        computeUnit->numVecOpsExecuted += num_active_lanes;
        if (isGmInstruction(ii)) {
            computeUnit->activeLanesPerGMemInstrDist.sample(num_active_lanes);
        } else if (isLmInstruction(ii)) {
            computeUnit->activeLanesPerLMemInstrDist.sample(num_active_lanes);
        }
    }

    // ---- Update Vector ALU pipeline and other resources ------------------ //
    // Single precision ALU or Branch or Return or Special instruction
    if (ii->isALU() || ii->isSpecialOp() ||
        ii->isBranch() ||
        // FIXME: Kernel argument loads are currently treated as ALU operations
        // since we don't send memory packets at execution. If we fix that then
        // we should map them to one of the memory pipelines
        (ii->isKernArgSeg() && ii->isLoad()) ||
        ii->isArgSeg() ||
        ii->isReturn()) {
        computeUnit->aluPipe[simdId].set(computeUnit->shader->
            ticks(computeUnit->spBypassLength()));

        // this is to enforce a fixed number of cycles per issue slot per SIMD
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isBarrier()) {
        computeUnit->wfWait[simdId].set(computeUnit->shader->
            ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isFlat()) {
        // flat loads must have been resolved to a concrete memory space
        assert(Enums::SC_NONE != ii->executedAs());

        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(4));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isStore() && ii->isFlat()) {
        assert(Enums::SC_NONE != ii->executedAs());
        if (Enums::SC_SHARED == ii->executedAs()) {
            computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->ShrMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        } else {
            computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
                set(computeUnit->shader->ticks(8));
            computeUnit->wfWait[computeUnit->GlbMemUnitId()].
                set(computeUnit->shader->ticks(computeUnit->issuePeriod));
        }
    } else if (ii->isLoad() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isGlobalMem()) {
        computeUnit->vrfToGlobalMemPipeBus[computeUnit->nextGlbRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->GlbMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isLoad() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(4));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if (ii->isStore() && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    } else if ((ii->isAtomic() || ii->isMemFence()) && ii->isLocalMem()) {
        computeUnit->vrfToLocalMemPipeBus[computeUnit->nextLocRdBus()].
            set(computeUnit->shader->ticks(8));
        computeUnit->wfWait[computeUnit->ShrMemUnitId()].
            set(computeUnit->shader->ticks(computeUnit->issuePeriod));
    }
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::waitingAtBarrier(int lane)
|
|
|
|
{
|
2016-09-16 18:26:52 +02:00
|
|
|
return barCnt[lane] < maxBarCnt;
|
2016-01-19 20:28:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
Wavefront::pushToReconvergenceStack(uint32_t pc, uint32_t rpc,
|
|
|
|
const VectorMask& mask)
|
|
|
|
{
|
|
|
|
assert(mask.count());
|
2016-09-16 18:29:01 +02:00
|
|
|
reconvergenceStack.emplace_back(new ReconvergenceStackEntry{pc, rpc, mask});
|
2016-01-19 20:28:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
void
Wavefront::popFromReconvergenceStack()
{
    assert(!reconvergenceStack.empty());

    // Trace the entry being discarded: the PC and execution mask read
    // here come from the current top-of-stack (via pc()/execMask()).
    DPRINTF(WavefrontStack, "[%2d, %2d, %2d, %2d] %s %3i => ",
            computeUnit->cu_id, simdId, wfSlotId, wfDynId,
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str(), pc());

    reconvergenceStack.pop_back();

    // After the pop, pc()/execMask() refer to the NEW top-of-stack entry.
    // NOTE(review): this assumes the stack is still non-empty after the
    // pop (pc() dereferences back()); confirm callers never pop the last
    // entry while tracing is enabled.
    DPRINTF(WavefrontStack, "%3i %s\n", pc(),
            execMask().to_string<char, std::string::traits_type,
            std::string::allocator_type>().c_str());
}
|
|
|
|
|
|
|
|
void
|
|
|
|
Wavefront::discardFetch()
|
|
|
|
{
|
|
|
|
instructionBuffer.clear();
|
|
|
|
dropFetch |=pendingFetch;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t
|
|
|
|
Wavefront::pc() const
|
|
|
|
{
|
2016-09-16 18:29:01 +02:00
|
|
|
return reconvergenceStack.back()->pc;
|
2016-01-19 20:28:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t
|
|
|
|
Wavefront::rpc() const
|
|
|
|
{
|
2016-09-16 18:29:01 +02:00
|
|
|
return reconvergenceStack.back()->rpc;
|
2016-01-19 20:28:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
VectorMask
Wavefront::execMask() const
{
    // Execution mask of the currently active (top-of-stack) path.
    const auto &top = reconvergenceStack.back();
    return top->execMask;
}
|
|
|
|
|
|
|
|
bool
|
|
|
|
Wavefront::execMask(int lane) const
|
|
|
|
{
|
2016-09-16 18:29:01 +02:00
|
|
|
return reconvergenceStack.back()->execMask[lane];
|
2016-01-19 20:28:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
Wavefront::pc(uint32_t new_pc)
|
|
|
|
{
|
2016-09-16 18:29:01 +02:00
|
|
|
reconvergenceStack.back()->pc = new_pc;
|
2016-01-19 20:28:22 +01:00
|
|
|
}
|
2016-09-16 18:27:56 +02:00
|
|
|
|
|
|
|
uint32_t
Wavefront::getStaticContextSize() const
{
    // Byte size of the fixed-layout (non-VGPR, non-LDS) portion of the
    // context image written by getContext() and read by setContext():
    // per-lane barrier counters, scalar bookkeeping fields, and a
    // fixed-capacity reconvergence-stack region of wfSize() entries.
    // NOTE(review): this sum must stay in sync with the exact field
    // order/widths used in getContext()/setContext(). The
    // sizeof(ldsChunk) term counts the pointer itself, but getContext()
    // does not appear to write a corresponding field — presumably slack
    // space; TODO confirm.
    return barCnt.size() * sizeof(int) + sizeof(wfId) + sizeof(maxBarCnt) +
        sizeof(oldBarrierCnt) + sizeof(barrierCnt) + sizeof(wgId) +
        sizeof(computeUnit->cu_id) + sizeof(barrierId) + sizeof(initMask) +
        sizeof(privBase) + sizeof(spillBase) + sizeof(ldsChunk) +
        computeUnit->wfSize() * sizeof(ReconvergenceStackEntry);
}
|
2016-09-16 18:32:36 +02:00
|
|
|
|
|
|
|
void
Wavefront::getContext(const void *out)
{
    // Serialize this wavefront's architectural context into the caller
    // supplied buffer `out` as a flat byte image. The layout (field
    // order and widths) must match setContext() and the size computed
    // by getStaticContextSize().
    uint8_t *iter = (uint8_t *)out;
    // Per-lane barrier counters.
    for (int i = 0; i < barCnt.size(); i++) {
        *(int *)iter = barCnt[i]; iter += sizeof(barCnt[i]);
    }
    // Scalar bookkeeping fields, written in a fixed order.
    *(int *)iter = wfId; iter += sizeof(wfId);
    *(int *)iter = maxBarCnt; iter += sizeof(maxBarCnt);
    *(int *)iter = oldBarrierCnt; iter += sizeof(oldBarrierCnt);
    *(int *)iter = barrierCnt; iter += sizeof(barrierCnt);
    *(int *)iter = computeUnit->cu_id; iter += sizeof(computeUnit->cu_id);
    *(uint32_t *)iter = wgId; iter += sizeof(wgId);
    *(uint32_t *)iter = barrierId; iter += sizeof(barrierId);
    // initMask is flattened to a 64-bit word via to_ullong().
    // NOTE(review): setContext() advances by sizeof(initMask) here —
    // these strides only agree if the VectorMask bitset occupies
    // exactly 8 bytes; TODO confirm.
    *(uint64_t *)iter = initMask.to_ullong(); iter += sizeof(initMask.to_ullong());
    *(Addr *)iter = privBase; iter += sizeof(privBase);
    *(Addr *)iter = spillBase; iter += sizeof(spillBase);

    // Drain the reconvergence stack into the image, top entry first,
    // padding the remaining slots with sentinel entries (all fields set
    // to their max value) so the region always holds
    // workItemId[0].size() entries. Note this is DESTRUCTIVE: the
    // stack is emptied as it is saved.
    // NOTE(review): entries are written top-first, but setContext()
    // pushes them back in read order, which would rebuild the stack in
    // reversed order — TODO confirm this is intended (e.g. stack depth
    // is at most one at context-switch points).
    int stackSize = reconvergenceStack.size();
    ReconvergenceStackEntry empty = {std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint32_t>::max(),
                                     std::numeric_limits<uint64_t>::max()};
    for (int i = 0; i < workItemId[0].size(); i++) {
        if (i < stackSize) {
            *(ReconvergenceStackEntry *)iter = *reconvergenceStack.back();
            iter += sizeof(ReconvergenceStackEntry);
            reconvergenceStack.pop_back();
        } else {
            *(ReconvergenceStackEntry *)iter = empty;
            iter += sizeof(ReconvergenceStackEntry);
        }
    }

    // Save single-precision VGPRs, one 32-bit value per lane.
    int wf_size = computeUnit->wfSize();
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = computeUnit->vrf[simdId]->
                read<uint32_t>(vgprIdx,lane);
            *(uint32_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // Save double-precision VGPRs, one 64-bit value per lane.
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = computeUnit->vrf[simdId]->
                read<uint64_t>(vgprIdx,lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    // Save condition registers, one 64-bit value per lane per register.
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = condRegState->read<uint64_t>(i, lane);
            *(uint64_t *)iter = regVal; iter += sizeof(regVal);
        }
    }

    /* saving LDS content */
    // Byte-copy the wavefront's LDS chunk, if one is allocated.
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = ldsChunk->read<char>(i);
            *(char *) iter = val; iter += sizeof(val);
        }
}
|
|
|
|
|
|
|
|
void
Wavefront::setContext(const void *in)
{
    // Restore this wavefront's architectural context from the flat byte
    // image in `in`. Field order and widths must mirror getContext().
    uint8_t *iter = (uint8_t *)in;
    // Per-lane barrier counters.
    for (int i = 0; i < barCnt.size(); i++) {
        barCnt[i] = *(int *)iter; iter += sizeof(barCnt[i]);
    }
    // Scalar bookkeeping fields, read in the same fixed order they were
    // written.
    wfId = *(int *)iter; iter += sizeof(wfId);
    maxBarCnt = *(int *)iter; iter += sizeof(maxBarCnt);
    oldBarrierCnt = *(int *)iter; iter += sizeof(oldBarrierCnt);
    barrierCnt = *(int *)iter; iter += sizeof(barrierCnt);
    computeUnit->cu_id = *(int *)iter; iter += sizeof(computeUnit->cu_id);
    wgId = *(uint32_t *)iter; iter += sizeof(wgId);
    barrierId = *(uint32_t *)iter; iter += sizeof(barrierId);
    // NOTE(review): getContext() advanced by sizeof(initMask.to_ullong())
    // (8 bytes) here; this advances by sizeof(initMask). They only agree
    // if the VectorMask bitset occupies exactly 8 bytes — TODO confirm.
    initMask = VectorMask(*(uint64_t *)iter); iter += sizeof(initMask);
    privBase = *(Addr *)iter; iter += sizeof(privBase);
    spillBase = *(Addr *)iter; iter += sizeof(spillBase);

    // Rebuild the reconvergence stack: the image always holds
    // workItemId[0].size() slots; slots whose pc is the uint32_t max
    // sentinel are padding and are skipped.
    // NOTE(review): getContext() wrote entries top-first, and this loop
    // pushes them in read order, so the rebuilt stack appears reversed
    // relative to the saved one — TODO confirm intended.
    for (int i = 0; i < workItemId[0].size(); i++) {
        ReconvergenceStackEntry newEntry = *(ReconvergenceStackEntry *)iter;
        iter += sizeof(ReconvergenceStackEntry);
        if (newEntry.pc != std::numeric_limits<uint32_t>::max()) {
            pushToReconvergenceStack(newEntry.pc, newEntry.rpc,
                                     newEntry.execMask);
        }
    }
    int wf_size = computeUnit->wfSize();

    // Restore single-precision VGPRs, one 32-bit value per lane.
    for (int i = 0; i < maxSpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint32_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint32_t regVal = *(uint32_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint32_t>(vgprIdx, regVal, lane);
        }
    }

    // Restore double-precision VGPRs, one 64-bit value per lane.
    for (int i = 0; i < maxDpVgprs; i++) {
        uint32_t vgprIdx = remap(i, sizeof(uint64_t), 1);
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            computeUnit->vrf[simdId]->write<uint64_t>(vgprIdx, regVal, lane);
        }
    }

    // Restore condition registers, one 64-bit value per lane per
    // register.
    for (int i = 0; i < condRegState->numRegs(); i++) {
        for (int lane = 0; lane < wf_size; lane++) {
            uint64_t regVal = *(uint64_t *)iter; iter += sizeof(regVal);
            condRegState->write<uint64_t>(i, lane, regVal);
        }
    }
    /** Restoring LDS contents */
    // Byte-copy the saved LDS image back, if a chunk is allocated.
    if (ldsChunk)
        for (int i = 0; i < ldsChunk->size(); i++) {
            char val = *(char *) iter; iter += sizeof(val);
            ldsChunk->write<char>(i, val);
        }
}
|
2016-10-04 19:03:52 +02:00
|
|
|
|
|
|
|
void
|
|
|
|
Wavefront::computeActualWgSz(NDRange *ndr)
|
|
|
|
{
|
|
|
|
actualWgSzTotal = 1;
|
|
|
|
for (int d = 0; d < 3; ++d) {
|
|
|
|
actualWgSz[d] = std::min(workGroupSz[d],
|
|
|
|
gridSz[d] - ndr->wgId[d] * workGroupSz[d]);
|
|
|
|
actualWgSzTotal *= actualWgSz[d];
|
|
|
|
}
|
|
|
|
}
|