de72e36619
this patch adds an ordered response buffer to the GM pipeline to ensure in-order data delivery. the buffer is implemented as a stl ordered map, which sorts the request in program order by using their sequence ID. when requests return to the GM pipeline they are marked as done. only the oldest request may be serviced from the ordered buffer, and only if is marked as done. the FIFO response buffers are kept and used in OoO delivery mode
1912 lines
64 KiB
C++
1912 lines
64 KiB
C++
/*
|
|
* Copyright (c) 2011-2015 Advanced Micro Devices, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* For use for simulation and test purposes only
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright notice,
|
|
* this list of conditions and the following disclaimer.
|
|
*
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
* and/or other materials provided with the distribution.
|
|
*
|
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
* Author: John Kalamatianos, Anthony Gutierrez
|
|
*/
|
|
#include "gpu-compute/compute_unit.hh"
|
|
|
|
#include <limits>
|
|
|
|
#include "base/output.hh"
|
|
#include "debug/GPUDisp.hh"
|
|
#include "debug/GPUExec.hh"
|
|
#include "debug/GPUFetch.hh"
|
|
#include "debug/GPUMem.hh"
|
|
#include "debug/GPUPort.hh"
|
|
#include "debug/GPUPrefetch.hh"
|
|
#include "debug/GPUSync.hh"
|
|
#include "debug/GPUTLB.hh"
|
|
#include "gpu-compute/dispatcher.hh"
|
|
#include "gpu-compute/gpu_dyn_inst.hh"
|
|
#include "gpu-compute/gpu_static_inst.hh"
|
|
#include "gpu-compute/ndrange.hh"
|
|
#include "gpu-compute/shader.hh"
|
|
#include "gpu-compute/simple_pool_manager.hh"
|
|
#include "gpu-compute/vector_register_file.hh"
|
|
#include "gpu-compute/wavefront.hh"
|
|
#include "mem/page_table.hh"
|
|
#include "sim/process.hh"
|
|
|
|
ComputeUnit::ComputeUnit(const Params *p) : MemObject(p), fetchStage(p),
|
|
scoreboardCheckStage(p), scheduleStage(p), execStage(p),
|
|
globalMemoryPipe(p), localMemoryPipe(p), rrNextMemID(0), rrNextALUWp(0),
|
|
cu_id(p->cu_id), vrf(p->vector_register_file), numSIMDs(p->num_SIMDs),
|
|
spBypassPipeLength(p->spbypass_pipe_length),
|
|
dpBypassPipeLength(p->dpbypass_pipe_length),
|
|
issuePeriod(p->issue_period),
|
|
numGlbMemUnits(p->num_global_mem_pipes),
|
|
numLocMemUnits(p->num_shared_mem_pipes),
|
|
perLaneTLB(p->perLaneTLB), prefetchDepth(p->prefetch_depth),
|
|
prefetchStride(p->prefetch_stride), prefetchType(p->prefetch_prev_type),
|
|
xact_cas_mode(p->xactCasMode), debugSegFault(p->debugSegFault),
|
|
functionalTLB(p->functionalTLB), localMemBarrier(p->localMemBarrier),
|
|
countPages(p->countPages), barrier_id(0),
|
|
vrfToCoalescerBusWidth(p->vrf_to_coalescer_bus_width),
|
|
coalescerToVrfBusWidth(p->coalescer_to_vrf_bus_width),
|
|
req_tick_latency(p->mem_req_latency * p->clk_domain->clockPeriod()),
|
|
resp_tick_latency(p->mem_resp_latency * p->clk_domain->clockPeriod()),
|
|
_masterId(p->system->getMasterId(name() + ".ComputeUnit")),
|
|
lds(*p->localDataStore), _cacheLineSize(p->system->cacheLineSize()),
|
|
globalSeqNum(0), wavefrontSize(p->wfSize),
|
|
kernelLaunchInst(new KernelLaunchStaticInst())
|
|
{
|
|
/**
|
|
* This check is necessary because std::bitset only provides conversion
|
|
* to unsigned long or unsigned long long via to_ulong() or to_ullong().
|
|
* there are * a few places in the code where to_ullong() is used, however
|
|
* if VSZ is larger than a value the host can support then bitset will
|
|
* throw a runtime exception. we should remove all use of to_long() or
|
|
* to_ullong() so we can have VSZ greater than 64b, however until that is
|
|
* done this assert is required.
|
|
*/
|
|
fatal_if(p->wfSize > std::numeric_limits<unsigned long long>::digits ||
|
|
p->wfSize <= 0,
|
|
"WF size is larger than the host can support");
|
|
fatal_if(!isPowerOf2(wavefrontSize),
|
|
"Wavefront size should be a power of 2");
|
|
// calculate how many cycles a vector load or store will need to transfer
|
|
// its data over the corresponding buses
|
|
numCyclesPerStoreTransfer =
|
|
(uint32_t)ceil((double)(wfSize() * sizeof(uint32_t)) /
|
|
(double)vrfToCoalescerBusWidth);
|
|
|
|
numCyclesPerLoadTransfer = (wfSize() * sizeof(uint32_t))
|
|
/ coalescerToVrfBusWidth;
|
|
|
|
lastVaddrWF.resize(numSIMDs);
|
|
wfList.resize(numSIMDs);
|
|
|
|
for (int j = 0; j < numSIMDs; ++j) {
|
|
lastVaddrWF[j].resize(p->n_wf);
|
|
|
|
for (int i = 0; i < p->n_wf; ++i) {
|
|
lastVaddrWF[j][i].resize(wfSize());
|
|
|
|
wfList[j].push_back(p->wavefronts[j * p->n_wf + i]);
|
|
wfList[j][i]->setParent(this);
|
|
|
|
for (int k = 0; k < wfSize(); ++k) {
|
|
lastVaddrWF[j][i][k] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
lastVaddrSimd.resize(numSIMDs);
|
|
|
|
for (int i = 0; i < numSIMDs; ++i) {
|
|
lastVaddrSimd[i].resize(wfSize(), 0);
|
|
}
|
|
|
|
lastVaddrCU.resize(wfSize());
|
|
|
|
lds.setParent(this);
|
|
|
|
if (p->execPolicy == "OLDEST-FIRST") {
|
|
exec_policy = EXEC_POLICY::OLDEST;
|
|
} else if (p->execPolicy == "ROUND-ROBIN") {
|
|
exec_policy = EXEC_POLICY::RR;
|
|
} else {
|
|
fatal("Invalid WF execution policy (CU)\n");
|
|
}
|
|
|
|
memPort.resize(wfSize());
|
|
|
|
// resize the tlbPort vectorArray
|
|
int tlbPort_width = perLaneTLB ? wfSize() : 1;
|
|
tlbPort.resize(tlbPort_width);
|
|
|
|
cuExitCallback = new CUExitCallback(this);
|
|
registerExitCallback(cuExitCallback);
|
|
|
|
xactCasLoadMap.clear();
|
|
lastExecCycle.resize(numSIMDs, 0);
|
|
|
|
for (int i = 0; i < vrf.size(); ++i) {
|
|
vrf[i]->setParent(this);
|
|
}
|
|
|
|
numVecRegsPerSimd = vrf[0]->numRegs();
|
|
}
|
|
|
|
ComputeUnit::~ComputeUnit()
|
|
{
|
|
// Delete wavefront slots
|
|
for (int j = 0; j < numSIMDs; ++j) {
|
|
for (int i = 0; i < shader->n_wf; ++i) {
|
|
delete wfList[j][i];
|
|
}
|
|
lastVaddrSimd[j].clear();
|
|
}
|
|
lastVaddrCU.clear();
|
|
readyList.clear();
|
|
waveStatusList.clear();
|
|
dispatchList.clear();
|
|
vectorAluInstAvail.clear();
|
|
delete cuExitCallback;
|
|
delete ldsPort;
|
|
}
|
|
|
|
void
|
|
ComputeUnit::fillKernelState(Wavefront *w, NDRange *ndr)
|
|
{
|
|
w->resizeRegFiles(ndr->q.cRegCount, ndr->q.sRegCount, ndr->q.dRegCount);
|
|
|
|
w->workGroupSz[0] = ndr->q.wgSize[0];
|
|
w->workGroupSz[1] = ndr->q.wgSize[1];
|
|
w->workGroupSz[2] = ndr->q.wgSize[2];
|
|
w->wgSz = w->workGroupSz[0] * w->workGroupSz[1] * w->workGroupSz[2];
|
|
w->gridSz[0] = ndr->q.gdSize[0];
|
|
w->gridSz[1] = ndr->q.gdSize[1];
|
|
w->gridSz[2] = ndr->q.gdSize[2];
|
|
w->kernelArgs = ndr->q.args;
|
|
w->privSizePerItem = ndr->q.privMemPerItem;
|
|
w->spillSizePerItem = ndr->q.spillMemPerItem;
|
|
w->roBase = ndr->q.roMemStart;
|
|
w->roSize = ndr->q.roMemTotal;
|
|
w->computeActualWgSz(ndr);
|
|
}
|
|
|
|
void
|
|
ComputeUnit::updateEvents() {
|
|
|
|
if (!timestampVec.empty()) {
|
|
uint32_t vecSize = timestampVec.size();
|
|
uint32_t i = 0;
|
|
while (i < vecSize) {
|
|
if (timestampVec[i] <= shader->tick_cnt) {
|
|
std::pair<uint32_t, uint32_t> regInfo = regIdxVec[i];
|
|
vrf[regInfo.first]->markReg(regInfo.second, sizeof(uint32_t),
|
|
statusVec[i]);
|
|
timestampVec.erase(timestampVec.begin() + i);
|
|
regIdxVec.erase(regIdxVec.begin() + i);
|
|
statusVec.erase(statusVec.begin() + i);
|
|
--vecSize;
|
|
--i;
|
|
}
|
|
++i;
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i< numSIMDs; ++i) {
|
|
vrf[i]->updateEvents();
|
|
}
|
|
}
|
|
|
|
|
|
void
|
|
ComputeUnit::startWavefront(Wavefront *w, int waveId, LdsChunk *ldsChunk,
|
|
NDRange *ndr)
|
|
{
|
|
static int _n_wave = 0;
|
|
|
|
VectorMask init_mask;
|
|
init_mask.reset();
|
|
|
|
for (int k = 0; k < wfSize(); ++k) {
|
|
if (k + waveId * wfSize() < w->actualWgSzTotal)
|
|
init_mask[k] = 1;
|
|
}
|
|
|
|
w->kernId = ndr->dispatchId;
|
|
w->wfId = waveId;
|
|
w->initMask = init_mask.to_ullong();
|
|
|
|
for (int k = 0; k < wfSize(); ++k) {
|
|
w->workItemId[0][k] = (k + waveId * wfSize()) % w->actualWgSz[0];
|
|
w->workItemId[1][k] = ((k + waveId * wfSize()) / w->actualWgSz[0]) %
|
|
w->actualWgSz[1];
|
|
w->workItemId[2][k] = (k + waveId * wfSize()) /
|
|
(w->actualWgSz[0] * w->actualWgSz[1]);
|
|
|
|
w->workItemFlatId[k] = w->workItemId[2][k] * w->actualWgSz[0] *
|
|
w->actualWgSz[1] + w->workItemId[1][k] * w->actualWgSz[0] +
|
|
w->workItemId[0][k];
|
|
}
|
|
|
|
w->barrierSlots = divCeil(w->actualWgSzTotal, wfSize());
|
|
|
|
w->barCnt.resize(wfSize(), 0);
|
|
|
|
w->maxBarCnt = 0;
|
|
w->oldBarrierCnt = 0;
|
|
w->barrierCnt = 0;
|
|
|
|
w->privBase = ndr->q.privMemStart;
|
|
ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
|
|
|
|
w->spillBase = ndr->q.spillMemStart;
|
|
ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
|
|
|
|
w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
|
|
|
|
// WG state
|
|
w->wgId = ndr->globalWgId;
|
|
w->dispatchId = ndr->dispatchId;
|
|
w->workGroupId[0] = w->wgId % ndr->numWg[0];
|
|
w->workGroupId[1] = (w->wgId / ndr->numWg[0]) % ndr->numWg[1];
|
|
w->workGroupId[2] = w->wgId / (ndr->numWg[0] * ndr->numWg[1]);
|
|
|
|
w->barrierId = barrier_id;
|
|
w->stalledAtBarrier = false;
|
|
|
|
// set the wavefront context to have a pointer to this section of the LDS
|
|
w->ldsChunk = ldsChunk;
|
|
|
|
int32_t refCount M5_VAR_USED =
|
|
lds.increaseRefCounter(w->dispatchId, w->wgId);
|
|
DPRINTF(GPUDisp, "CU%d: increase ref ctr wg[%d] to [%d]\n",
|
|
cu_id, w->wgId, refCount);
|
|
|
|
w->instructionBuffer.clear();
|
|
|
|
if (w->pendingFetch)
|
|
w->dropFetch = true;
|
|
|
|
// is this the last wavefront in the workgroup
|
|
// if set the spillWidth to be the remaining work-items
|
|
// so that the vector access is correct
|
|
if ((waveId + 1) * wfSize() >= w->actualWgSzTotal) {
|
|
w->spillWidth = w->actualWgSzTotal - (waveId * wfSize());
|
|
} else {
|
|
w->spillWidth = wfSize();
|
|
}
|
|
|
|
DPRINTF(GPUDisp, "Scheduling wfDynId/barrier_id %d/%d on CU%d: "
|
|
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
|
|
|
|
w->start(++_n_wave, ndr->q.code_ptr);
|
|
}
|
|
|
|
void
|
|
ComputeUnit::StartWorkgroup(NDRange *ndr)
|
|
{
|
|
// reserve the LDS capacity allocated to the work group
|
|
// disambiguated by the dispatch ID and workgroup ID, which should be
|
|
// globally unique
|
|
LdsChunk *ldsChunk = lds.reserveSpace(ndr->dispatchId, ndr->globalWgId,
|
|
ndr->q.ldsSize);
|
|
|
|
// Send L1 cache acquire
|
|
// isKernel + isAcquire = Kernel Begin
|
|
if (shader->impl_kern_boundary_sync) {
|
|
GPUDynInstPtr gpuDynInst =
|
|
std::make_shared<GPUDynInst>(this, nullptr, kernelLaunchInst,
|
|
getAndIncSeqNum());
|
|
|
|
gpuDynInst->useContinuation = false;
|
|
injectGlobalMemFence(gpuDynInst, true);
|
|
}
|
|
|
|
// calculate the number of 32-bit vector registers required by wavefront
|
|
int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
|
|
int wave_id = 0;
|
|
|
|
// Assign WFs by spreading them across SIMDs, 1 WF per SIMD at a time
|
|
for (int m = 0; m < shader->n_wf * numSIMDs; ++m) {
|
|
Wavefront *w = wfList[m % numSIMDs][m / numSIMDs];
|
|
// Check if this wavefront slot is available:
|
|
// It must be stopped and not waiting
|
|
// for a release to complete S_RETURNING
|
|
if (w->status == Wavefront::S_STOPPED) {
|
|
fillKernelState(w, ndr);
|
|
// if we have scheduled all work items then stop
|
|
// scheduling wavefronts
|
|
if (wave_id * wfSize() >= w->actualWgSzTotal)
|
|
break;
|
|
|
|
// reserve vector registers for the scheduled wavefront
|
|
assert(vectorRegsReserved[m % numSIMDs] <= numVecRegsPerSimd);
|
|
uint32_t normSize = 0;
|
|
|
|
w->startVgprIndex = vrf[m % numSIMDs]->manager->
|
|
allocateRegion(vregDemand, &normSize);
|
|
|
|
w->reservedVectorRegs = normSize;
|
|
vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
|
|
|
|
startWavefront(w, wave_id, ldsChunk, ndr);
|
|
++wave_id;
|
|
}
|
|
}
|
|
++barrier_id;
|
|
}
|
|
|
|
int
|
|
ComputeUnit::ReadyWorkgroup(NDRange *ndr)
|
|
{
|
|
// Get true size of workgroup (after clamping to grid size)
|
|
int trueWgSize[3];
|
|
int trueWgSizeTotal = 1;
|
|
|
|
for (int d = 0; d < 3; ++d) {
|
|
trueWgSize[d] = std::min(ndr->q.wgSize[d], ndr->q.gdSize[d] -
|
|
ndr->wgId[d] * ndr->q.wgSize[d]);
|
|
|
|
trueWgSizeTotal *= trueWgSize[d];
|
|
DPRINTF(GPUDisp, "trueWgSize[%d] = %d\n", d, trueWgSize[d]);
|
|
}
|
|
|
|
DPRINTF(GPUDisp, "trueWgSizeTotal = %d\n", trueWgSizeTotal);
|
|
|
|
// calculate the number of 32-bit vector registers required by each
|
|
// work item of the work group
|
|
int vregDemandPerWI = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
|
|
bool vregAvail = true;
|
|
int numWfs = (trueWgSizeTotal + wfSize() - 1) / wfSize();
|
|
int freeWfSlots = 0;
|
|
// check if the total number of VGPRs required by all WFs of the WG
|
|
// fit in the VRFs of all SIMD units
|
|
assert((numWfs * vregDemandPerWI) <= (numSIMDs * numVecRegsPerSimd));
|
|
int numMappedWfs = 0;
|
|
std::vector<int> numWfsPerSimd;
|
|
numWfsPerSimd.resize(numSIMDs, 0);
|
|
// find how many free WF slots we have across all SIMDs
|
|
for (int j = 0; j < shader->n_wf; ++j) {
|
|
for (int i = 0; i < numSIMDs; ++i) {
|
|
if (wfList[i][j]->status == Wavefront::S_STOPPED) {
|
|
// count the number of free WF slots
|
|
++freeWfSlots;
|
|
if (numMappedWfs < numWfs) {
|
|
// count the WFs to be assigned per SIMD
|
|
numWfsPerSimd[i]++;
|
|
}
|
|
numMappedWfs++;
|
|
}
|
|
}
|
|
}
|
|
|
|
// if there are enough free WF slots then find if there are enough
|
|
// free VGPRs per SIMD based on the WF->SIMD mapping
|
|
if (freeWfSlots >= numWfs) {
|
|
for (int j = 0; j < numSIMDs; ++j) {
|
|
// find if there are enough free VGPR regions in the SIMD's VRF
|
|
// to accommodate the WFs of the new WG that would be mapped to
|
|
// this SIMD unit
|
|
vregAvail = vrf[j]->manager->canAllocate(numWfsPerSimd[j],
|
|
vregDemandPerWI);
|
|
|
|
// stop searching if there is at least one SIMD
|
|
// whose VRF does not have enough free VGPR pools.
|
|
// This is because a WG is scheduled only if ALL
|
|
// of its WFs can be scheduled
|
|
if (!vregAvail)
|
|
break;
|
|
}
|
|
}
|
|
|
|
DPRINTF(GPUDisp, "Free WF slots = %d, VGPR Availability = %d\n",
|
|
freeWfSlots, vregAvail);
|
|
|
|
if (!vregAvail) {
|
|
++numTimesWgBlockedDueVgprAlloc;
|
|
}
|
|
|
|
// Return true if enough WF slots to submit workgroup and if there are
|
|
// enough VGPRs to schedule all WFs to their SIMD units
|
|
if (!lds.canReserve(ndr->q.ldsSize)) {
|
|
wgBlockedDueLdsAllocation++;
|
|
}
|
|
|
|
// Return true if (a) there are enough free WF slots to submit
|
|
// workgrounp and (b) if there are enough VGPRs to schedule all WFs to their
|
|
// SIMD units and (c) if there is enough space in LDS
|
|
return freeWfSlots >= numWfs && vregAvail && lds.canReserve(ndr->q.ldsSize);
|
|
}
|
|
|
|
int
|
|
ComputeUnit::AllAtBarrier(uint32_t _barrier_id, uint32_t bcnt, uint32_t bslots)
|
|
{
|
|
DPRINTF(GPUSync, "CU%d: Checking for All At Barrier\n", cu_id);
|
|
int ccnt = 0;
|
|
|
|
for (int i_simd = 0; i_simd < numSIMDs; ++i_simd) {
|
|
for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf) {
|
|
Wavefront *w = wfList[i_simd][i_wf];
|
|
|
|
if (w->status == Wavefront::S_RUNNING) {
|
|
DPRINTF(GPUSync, "Checking WF[%d][%d]\n", i_simd, i_wf);
|
|
|
|
DPRINTF(GPUSync, "wf->barrier_id = %d, _barrier_id = %d\n",
|
|
w->barrierId, _barrier_id);
|
|
|
|
DPRINTF(GPUSync, "wf->barrier_cnt %d, bcnt = %d\n",
|
|
w->barrierCnt, bcnt);
|
|
}
|
|
|
|
if (w->status == Wavefront::S_RUNNING &&
|
|
w->barrierId == _barrier_id && w->barrierCnt == bcnt &&
|
|
!w->outstandingReqs) {
|
|
++ccnt;
|
|
|
|
DPRINTF(GPUSync, "WF[%d][%d] at barrier, increment ccnt to "
|
|
"%d\n", i_simd, i_wf, ccnt);
|
|
}
|
|
}
|
|
}
|
|
|
|
DPRINTF(GPUSync, "CU%d: returning allAtBarrier ccnt = %d, bslots = %d\n",
|
|
cu_id, ccnt, bslots);
|
|
|
|
return ccnt == bslots;
|
|
}
|
|
|
|
// Check if the current wavefront is blocked on additional resources.
|
|
bool
|
|
ComputeUnit::cedeSIMD(int simdId, int wfSlotId)
|
|
{
|
|
bool cede = false;
|
|
|
|
// If --xact-cas-mode option is enabled in run.py, then xact_cas_ld
|
|
// magic instructions will impact the scheduling of wavefronts
|
|
if (xact_cas_mode) {
|
|
/*
|
|
* When a wavefront calls xact_cas_ld, it adds itself to a per address
|
|
* queue. All per address queues are managed by the xactCasLoadMap.
|
|
*
|
|
* A wavefront is not blocked if: it is not in ANY per address queue or
|
|
* if it is at the head of a per address queue.
|
|
*/
|
|
for (auto itMap : xactCasLoadMap) {
|
|
std::list<waveIdentifier> curWaveIDQueue = itMap.second.waveIDQueue;
|
|
|
|
if (!curWaveIDQueue.empty()) {
|
|
for (auto it : curWaveIDQueue) {
|
|
waveIdentifier cur_wave = it;
|
|
|
|
if (cur_wave.simdId == simdId &&
|
|
cur_wave.wfSlotId == wfSlotId) {
|
|
// 2 possibilities
|
|
// 1: this WF has a green light
|
|
// 2: another WF has a green light
|
|
waveIdentifier owner_wave = curWaveIDQueue.front();
|
|
|
|
if (owner_wave.simdId != cur_wave.simdId ||
|
|
owner_wave.wfSlotId != cur_wave.wfSlotId) {
|
|
// possibility 2
|
|
cede = true;
|
|
break;
|
|
} else {
|
|
// possibility 1
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return cede;
|
|
}
|
|
|
|
// Execute one clock worth of work on the ComputeUnit.
|
|
void
|
|
ComputeUnit::exec()
|
|
{
|
|
updateEvents();
|
|
// Execute pipeline stages in reverse order to simulate
|
|
// the pipeline latency
|
|
globalMemoryPipe.exec();
|
|
localMemoryPipe.exec();
|
|
execStage.exec();
|
|
scheduleStage.exec();
|
|
scoreboardCheckStage.exec();
|
|
fetchStage.exec();
|
|
|
|
totalCycles++;
|
|
}
|
|
|
|
void
|
|
ComputeUnit::init()
|
|
{
|
|
// Initialize CU Bus models
|
|
glbMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
|
|
locMemToVrfBus.init(&shader->tick_cnt, shader->ticks(1));
|
|
nextGlbMemBus = 0;
|
|
nextLocMemBus = 0;
|
|
fatal_if(numGlbMemUnits > 1,
|
|
"No support for multiple Global Memory Pipelines exists!!!");
|
|
vrfToGlobalMemPipeBus.resize(numGlbMemUnits);
|
|
for (int j = 0; j < numGlbMemUnits; ++j) {
|
|
vrfToGlobalMemPipeBus[j] = WaitClass();
|
|
vrfToGlobalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
|
|
}
|
|
|
|
fatal_if(numLocMemUnits > 1,
|
|
"No support for multiple Local Memory Pipelines exists!!!");
|
|
vrfToLocalMemPipeBus.resize(numLocMemUnits);
|
|
for (int j = 0; j < numLocMemUnits; ++j) {
|
|
vrfToLocalMemPipeBus[j] = WaitClass();
|
|
vrfToLocalMemPipeBus[j].init(&shader->tick_cnt, shader->ticks(1));
|
|
}
|
|
vectorRegsReserved.resize(numSIMDs, 0);
|
|
aluPipe.resize(numSIMDs);
|
|
wfWait.resize(numSIMDs + numLocMemUnits + numGlbMemUnits);
|
|
|
|
for (int i = 0; i < numSIMDs + numLocMemUnits + numGlbMemUnits; ++i) {
|
|
wfWait[i] = WaitClass();
|
|
wfWait[i].init(&shader->tick_cnt, shader->ticks(1));
|
|
}
|
|
|
|
for (int i = 0; i < numSIMDs; ++i) {
|
|
aluPipe[i] = WaitClass();
|
|
aluPipe[i].init(&shader->tick_cnt, shader->ticks(1));
|
|
}
|
|
|
|
// Setup space for call args
|
|
for (int j = 0; j < numSIMDs; ++j) {
|
|
for (int i = 0; i < shader->n_wf; ++i) {
|
|
wfList[j][i]->initCallArgMem(shader->funcargs_size, wavefrontSize);
|
|
}
|
|
}
|
|
|
|
// Initializing pipeline resources
|
|
readyList.resize(numSIMDs + numGlbMemUnits + numLocMemUnits);
|
|
waveStatusList.resize(numSIMDs);
|
|
|
|
for (int j = 0; j < numSIMDs; ++j) {
|
|
for (int i = 0; i < shader->n_wf; ++i) {
|
|
waveStatusList[j].push_back(
|
|
std::make_pair(wfList[j][i], BLOCKED));
|
|
}
|
|
}
|
|
|
|
for (int j = 0; j < (numSIMDs + numGlbMemUnits + numLocMemUnits); ++j) {
|
|
dispatchList.push_back(std::make_pair((Wavefront*)nullptr, EMPTY));
|
|
}
|
|
|
|
fetchStage.init(this);
|
|
scoreboardCheckStage.init(this);
|
|
scheduleStage.init(this);
|
|
execStage.init(this);
|
|
globalMemoryPipe.init(this);
|
|
localMemoryPipe.init(this);
|
|
// initialize state for statistics calculation
|
|
vectorAluInstAvail.resize(numSIMDs, false);
|
|
shrMemInstAvail = 0;
|
|
glbMemInstAvail = 0;
|
|
}
|
|
|
|
bool
|
|
ComputeUnit::DataPort::recvTimingResp(PacketPtr pkt)
|
|
{
|
|
// Ruby has completed the memory op. Schedule the mem_resp_event at the
|
|
// appropriate cycle to process the timing memory response
|
|
// This delay represents the pipeline delay
|
|
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
|
|
int index = sender_state->port_index;
|
|
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
|
|
|
|
// Is the packet returned a Kernel End or Barrier
|
|
if (pkt->req->isKernel() && pkt->req->isRelease()) {
|
|
Wavefront *w =
|
|
computeUnit->wfList[gpuDynInst->simdId][gpuDynInst->wfSlotId];
|
|
|
|
// Check if we are waiting on Kernel End Release
|
|
if (w->status == Wavefront::S_RETURNING) {
|
|
DPRINTF(GPUDisp, "CU%d: WF[%d][%d][wv=%d]: WG id completed %d\n",
|
|
computeUnit->cu_id, w->simdId, w->wfSlotId,
|
|
w->wfDynId, w->kernId);
|
|
|
|
computeUnit->shader->dispatcher->notifyWgCompl(w);
|
|
w->status = Wavefront::S_STOPPED;
|
|
} else {
|
|
w->outstandingReqs--;
|
|
}
|
|
|
|
DPRINTF(GPUSync, "CU%d: WF[%d][%d]: barrier_cnt = %d\n",
|
|
computeUnit->cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId, w->barrierCnt);
|
|
|
|
if (gpuDynInst->useContinuation) {
|
|
assert(!gpuDynInst->isNoScope());
|
|
gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
|
|
gpuDynInst);
|
|
}
|
|
|
|
delete pkt->senderState;
|
|
delete pkt->req;
|
|
delete pkt;
|
|
return true;
|
|
} else if (pkt->req->isKernel() && pkt->req->isAcquire()) {
|
|
if (gpuDynInst->useContinuation) {
|
|
assert(!gpuDynInst->isNoScope());
|
|
gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
|
|
gpuDynInst);
|
|
}
|
|
|
|
delete pkt->senderState;
|
|
delete pkt->req;
|
|
delete pkt;
|
|
return true;
|
|
}
|
|
|
|
ComputeUnit::DataPort::MemRespEvent *mem_resp_event =
|
|
new ComputeUnit::DataPort::MemRespEvent(computeUnit->memPort[index],
|
|
pkt);
|
|
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x received!\n",
|
|
computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
|
|
index, pkt->req->getPaddr());
|
|
|
|
computeUnit->schedule(mem_resp_event,
|
|
curTick() + computeUnit->resp_tick_latency);
|
|
return true;
|
|
}
|
|
|
|
void
|
|
ComputeUnit::DataPort::recvReqRetry()
|
|
{
|
|
int len = retries.size();
|
|
|
|
assert(len > 0);
|
|
|
|
for (int i = 0; i < len; ++i) {
|
|
PacketPtr pkt = retries.front().first;
|
|
GPUDynInstPtr gpuDynInst M5_VAR_USED = retries.front().second;
|
|
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: retry mem inst addr %#x\n",
|
|
computeUnit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
|
|
pkt->req->getPaddr());
|
|
|
|
/** Currently Ruby can return false due to conflicts for the particular
|
|
* cache block or address. Thus other requests should be allowed to
|
|
* pass and the data port should expect multiple retries. */
|
|
if (!sendTimingReq(pkt)) {
|
|
DPRINTF(GPUMem, "failed again!\n");
|
|
break;
|
|
} else {
|
|
DPRINTF(GPUMem, "successful!\n");
|
|
retries.pop_front();
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
ComputeUnit::SQCPort::recvTimingResp(PacketPtr pkt)
|
|
{
|
|
computeUnit->fetchStage.processFetchReturn(pkt);
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
ComputeUnit::SQCPort::recvReqRetry()
|
|
{
|
|
int len = retries.size();
|
|
|
|
assert(len > 0);
|
|
|
|
for (int i = 0; i < len; ++i) {
|
|
PacketPtr pkt = retries.front().first;
|
|
Wavefront *wavefront M5_VAR_USED = retries.front().second;
|
|
DPRINTF(GPUFetch, "CU%d: WF[%d][%d]: retrying FETCH addr %#x\n",
|
|
computeUnit->cu_id, wavefront->simdId, wavefront->wfSlotId,
|
|
pkt->req->getPaddr());
|
|
if (!sendTimingReq(pkt)) {
|
|
DPRINTF(GPUFetch, "failed again!\n");
|
|
break;
|
|
} else {
|
|
DPRINTF(GPUFetch, "successful!\n");
|
|
retries.pop_front();
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
ComputeUnit::sendRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
|
|
{
|
|
// There must be a way around this check to do the globalMemStart...
|
|
Addr tmp_vaddr = pkt->req->getVaddr();
|
|
|
|
updatePageDivergenceDist(tmp_vaddr);
|
|
|
|
pkt->req->setVirt(pkt->req->getAsid(), tmp_vaddr, pkt->req->getSize(),
|
|
pkt->req->getFlags(), pkt->req->masterId(),
|
|
pkt->req->getPC());
|
|
|
|
// figure out the type of the request to set read/write
|
|
BaseTLB::Mode TLB_mode;
|
|
assert(pkt->isRead() || pkt->isWrite());
|
|
|
|
// Check write before read for atomic operations
|
|
// since atomic operations should use BaseTLB::Write
|
|
if (pkt->isWrite()){
|
|
TLB_mode = BaseTLB::Write;
|
|
} else if (pkt->isRead()) {
|
|
TLB_mode = BaseTLB::Read;
|
|
} else {
|
|
fatal("pkt is not a read nor a write\n");
|
|
}
|
|
|
|
tlbCycles -= curTick();
|
|
++tlbRequests;
|
|
|
|
int tlbPort_index = perLaneTLB ? index : 0;
|
|
|
|
if (shader->timingSim) {
|
|
if (debugSegFault) {
|
|
Process *p = shader->gpuTc->getProcessPtr();
|
|
Addr vaddr = pkt->req->getVaddr();
|
|
unsigned size = pkt->getSize();
|
|
|
|
if ((vaddr + size - 1) % 64 < vaddr % 64) {
|
|
panic("CU%d: WF[%d][%d]: Access to addr %#x is unaligned!\n",
|
|
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, vaddr);
|
|
}
|
|
|
|
Addr paddr;
|
|
|
|
if (!p->pTable->translate(vaddr, paddr)) {
|
|
if (!p->fixupStackFault(vaddr)) {
|
|
panic("CU%d: WF[%d][%d]: Fault on addr %#x!\n",
|
|
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
|
|
vaddr);
|
|
}
|
|
}
|
|
}
|
|
|
|
// This is the SenderState needed upon return
|
|
pkt->senderState = new DTLBPort::SenderState(gpuDynInst, index);
|
|
|
|
// This is the senderState needed by the TLB hierarchy to function
|
|
TheISA::GpuTLB::TranslationState *translation_state =
|
|
new TheISA::GpuTLB::TranslationState(TLB_mode, shader->gpuTc, false,
|
|
pkt->senderState);
|
|
|
|
pkt->senderState = translation_state;
|
|
|
|
if (functionalTLB) {
|
|
tlbPort[tlbPort_index]->sendFunctional(pkt);
|
|
|
|
// update the hitLevel distribution
|
|
int hit_level = translation_state->hitLevel;
|
|
assert(hit_level != -1);
|
|
hitsPerTLBLevel[hit_level]++;
|
|
|
|
// New SenderState for the memory access
|
|
X86ISA::GpuTLB::TranslationState *sender_state =
|
|
safe_cast<X86ISA::GpuTLB::TranslationState*>(pkt->senderState);
|
|
|
|
delete sender_state->tlbEntry;
|
|
delete sender_state->saved;
|
|
delete sender_state;
|
|
|
|
assert(pkt->req->hasPaddr());
|
|
assert(pkt->req->hasSize());
|
|
|
|
uint8_t *tmpData = pkt->getPtr<uint8_t>();
|
|
|
|
// this is necessary because the GPU TLB receives packets instead
|
|
// of requests. when the translation is complete, all relevent
|
|
// fields in the request will be populated, but not in the packet.
|
|
// here we create the new packet so we can set the size, addr,
|
|
// and proper flags.
|
|
PacketPtr oldPkt = pkt;
|
|
pkt = new Packet(oldPkt->req, oldPkt->cmd);
|
|
delete oldPkt;
|
|
pkt->dataStatic(tmpData);
|
|
|
|
|
|
// New SenderState for the memory access
|
|
pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst,
|
|
index, nullptr);
|
|
|
|
gpuDynInst->memStatusVector[pkt->getAddr()].push_back(index);
|
|
gpuDynInst->tlbHitLevel[index] = hit_level;
|
|
|
|
|
|
// translation is done. Schedule the mem_req_event at the
|
|
// appropriate cycle to send the timing memory request to ruby
|
|
ComputeUnit::DataPort::MemReqEvent *mem_req_event =
|
|
new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
|
|
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data "
|
|
"scheduled\n", cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId, index, pkt->req->getPaddr());
|
|
|
|
schedule(mem_req_event, curTick() + req_tick_latency);
|
|
} else if (tlbPort[tlbPort_index]->isStalled()) {
|
|
assert(tlbPort[tlbPort_index]->retries.size() > 0);
|
|
|
|
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
|
|
"failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
|
|
tmp_vaddr);
|
|
|
|
tlbPort[tlbPort_index]->retries.push_back(pkt);
|
|
} else if (!tlbPort[tlbPort_index]->sendTimingReq(pkt)) {
|
|
// Stall the data port;
|
|
// No more packet will be issued till
|
|
// ruby indicates resources are freed by
|
|
// a recvReqRetry() call back on this port.
|
|
tlbPort[tlbPort_index]->stallPort();
|
|
|
|
DPRINTF(GPUTLB, "CU%d: WF[%d][%d]: Translation for addr %#x "
|
|
"failed!\n", cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
|
|
tmp_vaddr);
|
|
|
|
tlbPort[tlbPort_index]->retries.push_back(pkt);
|
|
} else {
|
|
DPRINTF(GPUTLB,
|
|
"CU%d: WF[%d][%d]: Translation for addr %#x sent!\n",
|
|
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, tmp_vaddr);
|
|
}
|
|
} else {
|
|
if (pkt->cmd == MemCmd::MemFenceReq) {
|
|
gpuDynInst->statusBitVector = VectorMask(0);
|
|
} else {
|
|
gpuDynInst->statusBitVector &= (~(1ll << index));
|
|
}
|
|
|
|
// New SenderState for the memory access
|
|
delete pkt->senderState;
|
|
|
|
// Because it's atomic operation, only need TLB translation state
|
|
pkt->senderState = new TheISA::GpuTLB::TranslationState(TLB_mode,
|
|
shader->gpuTc);
|
|
|
|
tlbPort[tlbPort_index]->sendFunctional(pkt);
|
|
|
|
// the addr of the packet is not modified, so we need to create a new
|
|
// packet, or otherwise the memory access will have the old virtual
|
|
// address sent in the translation packet, instead of the physical
|
|
// address returned by the translation.
|
|
PacketPtr new_pkt = new Packet(pkt->req, pkt->cmd);
|
|
new_pkt->dataStatic(pkt->getPtr<uint8_t>());
|
|
|
|
// Translation is done. It is safe to send the packet to memory.
|
|
memPort[0]->sendFunctional(new_pkt);
|
|
|
|
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: index %d: addr %#x\n", cu_id,
|
|
gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
|
|
new_pkt->req->getPaddr());
|
|
|
|
// safe_cast the senderState
|
|
TheISA::GpuTLB::TranslationState *sender_state =
|
|
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
|
|
|
|
delete sender_state->tlbEntry;
|
|
delete new_pkt;
|
|
delete pkt->senderState;
|
|
delete pkt->req;
|
|
delete pkt;
|
|
}
|
|
}
|
|
|
|
void
|
|
ComputeUnit::sendSyncRequest(GPUDynInstPtr gpuDynInst, int index, PacketPtr pkt)
|
|
{
|
|
ComputeUnit::DataPort::MemReqEvent *mem_req_event =
|
|
new ComputeUnit::DataPort::MemReqEvent(memPort[index], pkt);
|
|
|
|
|
|
// New SenderState for the memory access
|
|
pkt->senderState = new ComputeUnit::DataPort::SenderState(gpuDynInst, index,
|
|
nullptr);
|
|
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x sync scheduled\n",
|
|
cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId, index,
|
|
pkt->req->getPaddr());
|
|
|
|
schedule(mem_req_event, curTick() + req_tick_latency);
|
|
}
|
|
|
|
void
|
|
ComputeUnit::injectGlobalMemFence(GPUDynInstPtr gpuDynInst, bool kernelLaunch,
|
|
Request* req)
|
|
{
|
|
assert(gpuDynInst->isGlobalSeg());
|
|
|
|
if (!req) {
|
|
req = new Request(0, 0, 0, 0, masterId(), 0, gpuDynInst->wfDynId);
|
|
}
|
|
req->setPaddr(0);
|
|
if (kernelLaunch) {
|
|
req->setFlags(Request::KERNEL);
|
|
}
|
|
|
|
// for non-kernel MemFence operations, memorder flags are set depending
|
|
// on which type of request is currently being sent, so this
|
|
// should be set by the caller (e.g. if an inst has acq-rel
|
|
// semantics, it will send one acquire req an one release req)
|
|
gpuDynInst->setRequestFlags(req, kernelLaunch);
|
|
|
|
// a mem fence must correspond to an acquire/release request
|
|
assert(req->isAcquire() || req->isRelease());
|
|
|
|
// create packet
|
|
PacketPtr pkt = new Packet(req, MemCmd::MemFenceReq);
|
|
|
|
// set packet's sender state
|
|
pkt->senderState =
|
|
new ComputeUnit::DataPort::SenderState(gpuDynInst, 0, nullptr);
|
|
|
|
// send the packet
|
|
sendSyncRequest(gpuDynInst, 0, pkt);
|
|
}
|
|
|
|
const char*
|
|
ComputeUnit::DataPort::MemRespEvent::description() const
|
|
{
|
|
return "ComputeUnit memory response event";
|
|
}
|
|
|
|
void
|
|
ComputeUnit::DataPort::MemRespEvent::process()
|
|
{
|
|
DataPort::SenderState *sender_state =
|
|
safe_cast<DataPort::SenderState*>(pkt->senderState);
|
|
|
|
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
|
|
ComputeUnit *compute_unit = dataPort->computeUnit;
|
|
|
|
assert(gpuDynInst);
|
|
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: Response for addr %#x, index %d\n",
|
|
compute_unit->cu_id, gpuDynInst->simdId, gpuDynInst->wfSlotId,
|
|
pkt->req->getPaddr(), dataPort->index);
|
|
|
|
Addr paddr = pkt->req->getPaddr();
|
|
|
|
if (pkt->cmd != MemCmd::MemFenceResp) {
|
|
int index = gpuDynInst->memStatusVector[paddr].back();
|
|
|
|
DPRINTF(GPUMem, "Response for addr %#x, index %d\n",
|
|
pkt->req->getPaddr(), index);
|
|
|
|
gpuDynInst->memStatusVector[paddr].pop_back();
|
|
gpuDynInst->pAddr = pkt->req->getPaddr();
|
|
|
|
if (pkt->isRead() || pkt->isWrite()) {
|
|
|
|
if (gpuDynInst->n_reg <= MAX_REGS_FOR_NON_VEC_MEM_INST) {
|
|
gpuDynInst->statusBitVector &= (~(1ULL << index));
|
|
} else {
|
|
assert(gpuDynInst->statusVector[index] > 0);
|
|
gpuDynInst->statusVector[index]--;
|
|
|
|
if (!gpuDynInst->statusVector[index])
|
|
gpuDynInst->statusBitVector &= (~(1ULL << index));
|
|
}
|
|
|
|
DPRINTF(GPUMem, "bitvector is now %#x\n",
|
|
gpuDynInst->statusBitVector);
|
|
|
|
if (gpuDynInst->statusBitVector == VectorMask(0)) {
|
|
auto iter = gpuDynInst->memStatusVector.begin();
|
|
auto end = gpuDynInst->memStatusVector.end();
|
|
|
|
while (iter != end) {
|
|
assert(iter->second.empty());
|
|
++iter;
|
|
}
|
|
|
|
gpuDynInst->memStatusVector.clear();
|
|
|
|
if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
|
|
gpuDynInst->statusVector.clear();
|
|
|
|
compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
|
|
|
|
DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
|
|
compute_unit->cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId);
|
|
|
|
// after clearing the status vectors,
|
|
// see if there is a continuation to perform
|
|
// the continuation may generate more work for
|
|
// this memory request
|
|
if (gpuDynInst->useContinuation) {
|
|
assert(!gpuDynInst->isNoScope());
|
|
gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
|
|
gpuDynInst);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
gpuDynInst->statusBitVector = VectorMask(0);
|
|
|
|
if (gpuDynInst->useContinuation) {
|
|
assert(!gpuDynInst->isNoScope());
|
|
gpuDynInst->execContinuation(gpuDynInst->staticInstruction(),
|
|
gpuDynInst);
|
|
}
|
|
}
|
|
|
|
delete pkt->senderState;
|
|
delete pkt->req;
|
|
delete pkt;
|
|
}
|
|
|
|
ComputeUnit*
|
|
ComputeUnitParams::create()
|
|
{
|
|
return new ComputeUnit(this);
|
|
}
|
|
|
|
bool
|
|
ComputeUnit::DTLBPort::recvTimingResp(PacketPtr pkt)
|
|
{
|
|
Addr line = pkt->req->getPaddr();
|
|
|
|
DPRINTF(GPUTLB, "CU%d: DTLBPort received %#x->%#x\n", computeUnit->cu_id,
|
|
pkt->req->getVaddr(), line);
|
|
|
|
assert(pkt->senderState);
|
|
computeUnit->tlbCycles += curTick();
|
|
|
|
// pop off the TLB translation state
|
|
TheISA::GpuTLB::TranslationState *translation_state =
|
|
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
|
|
|
|
// no PageFaults are permitted for data accesses
|
|
if (!translation_state->tlbEntry->valid) {
|
|
DTLBPort::SenderState *sender_state =
|
|
safe_cast<DTLBPort::SenderState*>(translation_state->saved);
|
|
|
|
Wavefront *w M5_VAR_USED =
|
|
computeUnit->wfList[sender_state->_gpuDynInst->simdId]
|
|
[sender_state->_gpuDynInst->wfSlotId];
|
|
|
|
DPRINTFN("Wave %d couldn't tranlate vaddr %#x\n", w->wfDynId,
|
|
pkt->req->getVaddr());
|
|
}
|
|
|
|
assert(translation_state->tlbEntry->valid);
|
|
|
|
// update the hitLevel distribution
|
|
int hit_level = translation_state->hitLevel;
|
|
computeUnit->hitsPerTLBLevel[hit_level]++;
|
|
|
|
delete translation_state->tlbEntry;
|
|
assert(!translation_state->ports.size());
|
|
pkt->senderState = translation_state->saved;
|
|
|
|
// for prefetch pkt
|
|
BaseTLB::Mode TLB_mode = translation_state->tlbMode;
|
|
|
|
delete translation_state;
|
|
|
|
// use the original sender state to know how to close this transaction
|
|
DTLBPort::SenderState *sender_state =
|
|
safe_cast<DTLBPort::SenderState*>(pkt->senderState);
|
|
|
|
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
|
|
int mp_index = sender_state->portIndex;
|
|
Addr vaddr = pkt->req->getVaddr();
|
|
gpuDynInst->memStatusVector[line].push_back(mp_index);
|
|
gpuDynInst->tlbHitLevel[mp_index] = hit_level;
|
|
|
|
MemCmd requestCmd;
|
|
|
|
if (pkt->cmd == MemCmd::ReadResp) {
|
|
requestCmd = MemCmd::ReadReq;
|
|
} else if (pkt->cmd == MemCmd::WriteResp) {
|
|
requestCmd = MemCmd::WriteReq;
|
|
} else if (pkt->cmd == MemCmd::SwapResp) {
|
|
requestCmd = MemCmd::SwapReq;
|
|
} else {
|
|
panic("unsupported response to request conversion %s\n",
|
|
pkt->cmd.toString());
|
|
}
|
|
|
|
if (computeUnit->prefetchDepth) {
|
|
int simdId = gpuDynInst->simdId;
|
|
int wfSlotId = gpuDynInst->wfSlotId;
|
|
Addr last = 0;
|
|
|
|
switch(computeUnit->prefetchType) {
|
|
case Enums::PF_CU:
|
|
last = computeUnit->lastVaddrCU[mp_index];
|
|
break;
|
|
case Enums::PF_PHASE:
|
|
last = computeUnit->lastVaddrSimd[simdId][mp_index];
|
|
break;
|
|
case Enums::PF_WF:
|
|
last = computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index];
|
|
default:
|
|
break;
|
|
}
|
|
|
|
DPRINTF(GPUPrefetch, "CU[%d][%d][%d][%d]: %#x was last\n",
|
|
computeUnit->cu_id, simdId, wfSlotId, mp_index, last);
|
|
|
|
int stride = last ? (roundDown(vaddr, TheISA::PageBytes) -
|
|
roundDown(last, TheISA::PageBytes)) >> TheISA::PageShift
|
|
: 0;
|
|
|
|
DPRINTF(GPUPrefetch, "Stride is %d\n", stride);
|
|
|
|
computeUnit->lastVaddrCU[mp_index] = vaddr;
|
|
computeUnit->lastVaddrSimd[simdId][mp_index] = vaddr;
|
|
computeUnit->lastVaddrWF[simdId][wfSlotId][mp_index] = vaddr;
|
|
|
|
stride = (computeUnit->prefetchType == Enums::PF_STRIDE) ?
|
|
computeUnit->prefetchStride: stride;
|
|
|
|
DPRINTF(GPUPrefetch, "%#x to: CU[%d][%d][%d][%d]\n", vaddr,
|
|
computeUnit->cu_id, simdId, wfSlotId, mp_index);
|
|
|
|
DPRINTF(GPUPrefetch, "Prefetching from %#x:", vaddr);
|
|
|
|
// Prefetch Next few pages atomically
|
|
for (int pf = 1; pf <= computeUnit->prefetchDepth; ++pf) {
|
|
DPRINTF(GPUPrefetch, "%d * %d: %#x\n", pf, stride,
|
|
vaddr+stride*pf*TheISA::PageBytes);
|
|
|
|
if (!stride)
|
|
break;
|
|
|
|
Request *prefetch_req = new Request(0, vaddr + stride * pf *
|
|
TheISA::PageBytes,
|
|
sizeof(uint8_t), 0,
|
|
computeUnit->masterId(),
|
|
0, 0, 0);
|
|
|
|
PacketPtr prefetch_pkt = new Packet(prefetch_req, requestCmd);
|
|
uint8_t foo = 0;
|
|
prefetch_pkt->dataStatic(&foo);
|
|
|
|
// Because it's atomic operation, only need TLB translation state
|
|
prefetch_pkt->senderState =
|
|
new TheISA::GpuTLB::TranslationState(TLB_mode,
|
|
computeUnit->shader->gpuTc,
|
|
true);
|
|
|
|
// Currently prefetches are zero-latency, hence the sendFunctional
|
|
sendFunctional(prefetch_pkt);
|
|
|
|
/* safe_cast the senderState */
|
|
TheISA::GpuTLB::TranslationState *tlb_state =
|
|
safe_cast<TheISA::GpuTLB::TranslationState*>(
|
|
prefetch_pkt->senderState);
|
|
|
|
|
|
delete tlb_state->tlbEntry;
|
|
delete tlb_state;
|
|
delete prefetch_pkt->req;
|
|
delete prefetch_pkt;
|
|
}
|
|
}
|
|
|
|
// First we must convert the response cmd back to a request cmd so that
|
|
// the request can be sent through the cu's master port
|
|
PacketPtr new_pkt = new Packet(pkt->req, requestCmd);
|
|
new_pkt->dataStatic(pkt->getPtr<uint8_t>());
|
|
delete pkt->senderState;
|
|
delete pkt;
|
|
|
|
// New SenderState for the memory access
|
|
new_pkt->senderState =
|
|
new ComputeUnit::DataPort::SenderState(gpuDynInst, mp_index,
|
|
nullptr);
|
|
|
|
// translation is done. Schedule the mem_req_event at the appropriate
|
|
// cycle to send the timing memory request to ruby
|
|
ComputeUnit::DataPort::MemReqEvent *mem_req_event =
|
|
new ComputeUnit::DataPort::MemReqEvent(computeUnit->memPort[mp_index],
|
|
new_pkt);
|
|
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: index %d, addr %#x data scheduled\n",
|
|
computeUnit->cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId, mp_index, new_pkt->req->getPaddr());
|
|
|
|
computeUnit->schedule(mem_req_event, curTick() +
|
|
computeUnit->req_tick_latency);
|
|
|
|
return true;
|
|
}
|
|
|
|
const char*
|
|
ComputeUnit::DataPort::MemReqEvent::description() const
|
|
{
|
|
return "ComputeUnit memory request event";
|
|
}
|
|
|
|
void
|
|
ComputeUnit::DataPort::MemReqEvent::process()
|
|
{
|
|
SenderState *sender_state = safe_cast<SenderState*>(pkt->senderState);
|
|
GPUDynInstPtr gpuDynInst = sender_state->_gpuDynInst;
|
|
ComputeUnit *compute_unit M5_VAR_USED = dataPort->computeUnit;
|
|
|
|
if (!(dataPort->sendTimingReq(pkt))) {
|
|
dataPort->retries.push_back(std::make_pair(pkt, gpuDynInst));
|
|
|
|
DPRINTF(GPUPort,
|
|
"CU%d: WF[%d][%d]: index %d, addr %#x data req failed!\n",
|
|
compute_unit->cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId, dataPort->index,
|
|
pkt->req->getPaddr());
|
|
} else {
|
|
DPRINTF(GPUPort,
|
|
"CU%d: WF[%d][%d]: index %d, addr %#x data req sent!\n",
|
|
compute_unit->cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId, dataPort->index,
|
|
pkt->req->getPaddr());
|
|
}
|
|
}
|
|
|
|
/*
|
|
* The initial translation request could have been rejected,
|
|
* if <retries> queue is not Retry sending the translation
|
|
* request. sendRetry() is called from the peer port whenever
|
|
* a translation completes.
|
|
*/
|
|
void
|
|
ComputeUnit::DTLBPort::recvReqRetry()
|
|
{
|
|
int len = retries.size();
|
|
|
|
DPRINTF(GPUTLB, "CU%d: DTLB recvReqRetry - %d pending requests\n",
|
|
computeUnit->cu_id, len);
|
|
|
|
assert(len > 0);
|
|
assert(isStalled());
|
|
// recvReqRetry is an indication that the resource on which this
|
|
// port was stalling on is freed. So, remove the stall first
|
|
unstallPort();
|
|
|
|
for (int i = 0; i < len; ++i) {
|
|
PacketPtr pkt = retries.front();
|
|
Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
|
|
DPRINTF(GPUTLB, "CU%d: retrying D-translaton for address%#x", vaddr);
|
|
|
|
if (!sendTimingReq(pkt)) {
|
|
// Stall port
|
|
stallPort();
|
|
DPRINTF(GPUTLB, ": failed again\n");
|
|
break;
|
|
} else {
|
|
DPRINTF(GPUTLB, ": successful\n");
|
|
retries.pop_front();
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
ComputeUnit::ITLBPort::recvTimingResp(PacketPtr pkt)
|
|
{
|
|
Addr line M5_VAR_USED = pkt->req->getPaddr();
|
|
DPRINTF(GPUTLB, "CU%d: ITLBPort received %#x->%#x\n",
|
|
computeUnit->cu_id, pkt->req->getVaddr(), line);
|
|
|
|
assert(pkt->senderState);
|
|
|
|
// pop off the TLB translation state
|
|
TheISA::GpuTLB::TranslationState *translation_state =
|
|
safe_cast<TheISA::GpuTLB::TranslationState*>(pkt->senderState);
|
|
|
|
bool success = translation_state->tlbEntry->valid;
|
|
delete translation_state->tlbEntry;
|
|
assert(!translation_state->ports.size());
|
|
pkt->senderState = translation_state->saved;
|
|
delete translation_state;
|
|
|
|
// use the original sender state to know how to close this transaction
|
|
ITLBPort::SenderState *sender_state =
|
|
safe_cast<ITLBPort::SenderState*>(pkt->senderState);
|
|
|
|
// get the wavefront associated with this translation request
|
|
Wavefront *wavefront = sender_state->wavefront;
|
|
delete pkt->senderState;
|
|
|
|
if (success) {
|
|
// pkt is reused in fetch(), don't delete it here. However, we must
|
|
// reset the command to be a request so that it can be sent through
|
|
// the cu's master port
|
|
assert(pkt->cmd == MemCmd::ReadResp);
|
|
pkt->cmd = MemCmd::ReadReq;
|
|
|
|
computeUnit->fetchStage.fetch(pkt, wavefront);
|
|
} else {
|
|
if (wavefront->dropFetch) {
|
|
assert(wavefront->instructionBuffer.empty());
|
|
wavefront->dropFetch = false;
|
|
}
|
|
|
|
wavefront->pendingFetch = 0;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* The initial translation request could have been rejected, if
|
|
* <retries> queue is not empty. Retry sending the translation
|
|
* request. sendRetry() is called from the peer port whenever
|
|
* a translation completes.
|
|
*/
|
|
void
|
|
ComputeUnit::ITLBPort::recvReqRetry()
|
|
{
|
|
|
|
int len = retries.size();
|
|
DPRINTF(GPUTLB, "CU%d: ITLB recvReqRetry - %d pending requests\n", len);
|
|
|
|
assert(len > 0);
|
|
assert(isStalled());
|
|
|
|
// recvReqRetry is an indication that the resource on which this
|
|
// port was stalling on is freed. So, remove the stall first
|
|
unstallPort();
|
|
|
|
for (int i = 0; i < len; ++i) {
|
|
PacketPtr pkt = retries.front();
|
|
Addr vaddr M5_VAR_USED = pkt->req->getVaddr();
|
|
DPRINTF(GPUTLB, "CU%d: retrying I-translaton for address%#x", vaddr);
|
|
|
|
if (!sendTimingReq(pkt)) {
|
|
stallPort(); // Stall port
|
|
DPRINTF(GPUTLB, ": failed again\n");
|
|
break;
|
|
} else {
|
|
DPRINTF(GPUTLB, ": successful\n");
|
|
retries.pop_front();
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
ComputeUnit::regStats()
|
|
{
|
|
MemObject::regStats();
|
|
|
|
vALUInsts
|
|
.name(name() + ".valu_insts")
|
|
.desc("Number of vector ALU insts issued.")
|
|
;
|
|
vALUInstsPerWF
|
|
.name(name() + ".valu_insts_per_wf")
|
|
.desc("The avg. number of vector ALU insts issued per-wavefront.")
|
|
;
|
|
sALUInsts
|
|
.name(name() + ".salu_insts")
|
|
.desc("Number of scalar ALU insts issued.")
|
|
;
|
|
sALUInstsPerWF
|
|
.name(name() + ".salu_insts_per_wf")
|
|
.desc("The avg. number of scalar ALU insts issued per-wavefront.")
|
|
;
|
|
instCyclesVALU
|
|
.name(name() + ".inst_cycles_valu")
|
|
.desc("Number of cycles needed to execute VALU insts.")
|
|
;
|
|
instCyclesSALU
|
|
.name(name() + ".inst_cycles_salu")
|
|
.desc("Number of cycles needed to execute SALU insts.")
|
|
;
|
|
threadCyclesVALU
|
|
.name(name() + ".thread_cycles_valu")
|
|
.desc("Number of thread cycles used to execute vector ALU ops. "
|
|
"Similar to instCyclesVALU but multiplied by the number of "
|
|
"active threads.")
|
|
;
|
|
vALUUtilization
|
|
.name(name() + ".valu_utilization")
|
|
.desc("Percentage of active vector ALU threads in a wave.")
|
|
;
|
|
ldsNoFlatInsts
|
|
.name(name() + ".lds_no_flat_insts")
|
|
.desc("Number of LDS insts issued, not including FLAT "
|
|
"accesses that resolve to LDS.")
|
|
;
|
|
ldsNoFlatInstsPerWF
|
|
.name(name() + ".lds_no_flat_insts_per_wf")
|
|
.desc("The avg. number of LDS insts (not including FLAT "
|
|
"accesses that resolve to LDS) per-wavefront.")
|
|
;
|
|
flatVMemInsts
|
|
.name(name() + ".flat_vmem_insts")
|
|
.desc("The number of FLAT insts that resolve to vmem issued.")
|
|
;
|
|
flatVMemInstsPerWF
|
|
.name(name() + ".flat_vmem_insts_per_wf")
|
|
.desc("The average number of FLAT insts that resolve to vmem "
|
|
"issued per-wavefront.")
|
|
;
|
|
flatLDSInsts
|
|
.name(name() + ".flat_lds_insts")
|
|
.desc("The number of FLAT insts that resolve to LDS issued.")
|
|
;
|
|
flatLDSInstsPerWF
|
|
.name(name() + ".flat_lds_insts_per_wf")
|
|
.desc("The average number of FLAT insts that resolve to LDS "
|
|
"issued per-wavefront.")
|
|
;
|
|
vectorMemWrites
|
|
.name(name() + ".vector_mem_writes")
|
|
.desc("Number of vector mem write insts (excluding FLAT insts).")
|
|
;
|
|
vectorMemWritesPerWF
|
|
.name(name() + ".vector_mem_writes_per_wf")
|
|
.desc("The average number of vector mem write insts "
|
|
"(excluding FLAT insts) per-wavefront.")
|
|
;
|
|
vectorMemReads
|
|
.name(name() + ".vector_mem_reads")
|
|
.desc("Number of vector mem read insts (excluding FLAT insts).")
|
|
;
|
|
vectorMemReadsPerWF
|
|
.name(name() + ".vector_mem_reads_per_wf")
|
|
.desc("The avg. number of vector mem read insts (excluding "
|
|
"FLAT insts) per-wavefront.")
|
|
;
|
|
scalarMemWrites
|
|
.name(name() + ".scalar_mem_writes")
|
|
.desc("Number of scalar mem write insts.")
|
|
;
|
|
scalarMemWritesPerWF
|
|
.name(name() + ".scalar_mem_writes_per_wf")
|
|
.desc("The average number of scalar mem write insts per-wavefront.")
|
|
;
|
|
scalarMemReads
|
|
.name(name() + ".scalar_mem_reads")
|
|
.desc("Number of scalar mem read insts.")
|
|
;
|
|
scalarMemReadsPerWF
|
|
.name(name() + ".scalar_mem_reads_per_wf")
|
|
.desc("The average number of scalar mem read insts per-wavefront.")
|
|
;
|
|
|
|
vALUInstsPerWF = vALUInsts / completedWfs;
|
|
sALUInstsPerWF = sALUInsts / completedWfs;
|
|
vALUUtilization = (threadCyclesVALU / (64 * instCyclesVALU)) * 100;
|
|
ldsNoFlatInstsPerWF = ldsNoFlatInsts / completedWfs;
|
|
flatVMemInstsPerWF = flatVMemInsts / completedWfs;
|
|
flatLDSInstsPerWF = flatLDSInsts / completedWfs;
|
|
vectorMemWritesPerWF = vectorMemWrites / completedWfs;
|
|
vectorMemReadsPerWF = vectorMemReads / completedWfs;
|
|
scalarMemWritesPerWF = scalarMemWrites / completedWfs;
|
|
scalarMemReadsPerWF = scalarMemReads / completedWfs;
|
|
|
|
tlbCycles
|
|
.name(name() + ".tlb_cycles")
|
|
.desc("total number of cycles for all uncoalesced requests")
|
|
;
|
|
|
|
tlbRequests
|
|
.name(name() + ".tlb_requests")
|
|
.desc("number of uncoalesced requests")
|
|
;
|
|
|
|
tlbLatency
|
|
.name(name() + ".avg_translation_latency")
|
|
.desc("Avg. translation latency for data translations")
|
|
;
|
|
|
|
tlbLatency = tlbCycles / tlbRequests;
|
|
|
|
hitsPerTLBLevel
|
|
.init(4)
|
|
.name(name() + ".TLB_hits_distribution")
|
|
.desc("TLB hits distribution (0 for page table, x for Lx-TLB")
|
|
;
|
|
|
|
// fixed number of TLB levels
|
|
for (int i = 0; i < 4; ++i) {
|
|
if (!i)
|
|
hitsPerTLBLevel.subname(i,"page_table");
|
|
else
|
|
hitsPerTLBLevel.subname(i, csprintf("L%d_TLB",i));
|
|
}
|
|
|
|
execRateDist
|
|
.init(0, 10, 2)
|
|
.name(name() + ".inst_exec_rate")
|
|
.desc("Instruction Execution Rate: Number of executed vector "
|
|
"instructions per cycle")
|
|
;
|
|
|
|
ldsBankConflictDist
|
|
.init(0, wfSize(), 2)
|
|
.name(name() + ".lds_bank_conflicts")
|
|
.desc("Number of bank conflicts per LDS memory packet")
|
|
;
|
|
|
|
ldsBankAccesses
|
|
.name(name() + ".lds_bank_access_cnt")
|
|
.desc("Total number of LDS bank accesses")
|
|
;
|
|
|
|
pageDivergenceDist
|
|
// A wavefront can touch up to N pages per memory instruction where
|
|
// N is equal to the wavefront size
|
|
// The number of pages per bin can be configured (here it's 4).
|
|
.init(1, wfSize(), 4)
|
|
.name(name() + ".page_divergence_dist")
|
|
.desc("pages touched per wf (over all mem. instr.)")
|
|
;
|
|
|
|
controlFlowDivergenceDist
|
|
.init(1, wfSize(), 4)
|
|
.name(name() + ".warp_execution_dist")
|
|
.desc("number of lanes active per instruction (oval all instructions)")
|
|
;
|
|
|
|
activeLanesPerGMemInstrDist
|
|
.init(1, wfSize(), 4)
|
|
.name(name() + ".gmem_lanes_execution_dist")
|
|
.desc("number of active lanes per global memory instruction")
|
|
;
|
|
|
|
activeLanesPerLMemInstrDist
|
|
.init(1, wfSize(), 4)
|
|
.name(name() + ".lmem_lanes_execution_dist")
|
|
.desc("number of active lanes per local memory instruction")
|
|
;
|
|
|
|
numInstrExecuted
|
|
.name(name() + ".num_instr_executed")
|
|
.desc("number of instructions executed")
|
|
;
|
|
|
|
numVecOpsExecuted
|
|
.name(name() + ".num_vec_ops_executed")
|
|
.desc("number of vec ops executed (e.g. WF size/inst)")
|
|
;
|
|
|
|
totalCycles
|
|
.name(name() + ".num_total_cycles")
|
|
.desc("number of cycles the CU ran for")
|
|
;
|
|
|
|
ipc
|
|
.name(name() + ".ipc")
|
|
.desc("Instructions per cycle (this CU only)")
|
|
;
|
|
|
|
vpc
|
|
.name(name() + ".vpc")
|
|
.desc("Vector Operations per cycle (this CU only)")
|
|
;
|
|
|
|
numALUInstsExecuted
|
|
.name(name() + ".num_alu_insts_executed")
|
|
.desc("Number of dynamic non-GM memory insts executed")
|
|
;
|
|
|
|
wgBlockedDueLdsAllocation
|
|
.name(name() + ".wg_blocked_due_lds_alloc")
|
|
.desc("Workgroup blocked due to LDS capacity")
|
|
;
|
|
|
|
ipc = numInstrExecuted / totalCycles;
|
|
vpc = numVecOpsExecuted / totalCycles;
|
|
|
|
numTimesWgBlockedDueVgprAlloc
|
|
.name(name() + ".times_wg_blocked_due_vgpr_alloc")
|
|
.desc("Number of times WGs are blocked due to VGPR allocation per SIMD")
|
|
;
|
|
|
|
dynamicGMemInstrCnt
|
|
.name(name() + ".global_mem_instr_cnt")
|
|
.desc("dynamic global memory instructions count")
|
|
;
|
|
|
|
dynamicLMemInstrCnt
|
|
.name(name() + ".local_mem_instr_cnt")
|
|
.desc("dynamic local memory intruction count")
|
|
;
|
|
|
|
numALUInstsExecuted = numInstrExecuted - dynamicGMemInstrCnt -
|
|
dynamicLMemInstrCnt;
|
|
|
|
completedWfs
|
|
.name(name() + ".num_completed_wfs")
|
|
.desc("number of completed wavefronts")
|
|
;
|
|
|
|
numCASOps
|
|
.name(name() + ".num_CAS_ops")
|
|
.desc("number of compare and swap operations")
|
|
;
|
|
|
|
numFailedCASOps
|
|
.name(name() + ".num_failed_CAS_ops")
|
|
.desc("number of compare and swap operations that failed")
|
|
;
|
|
|
|
// register stats of pipeline stages
|
|
fetchStage.regStats();
|
|
scoreboardCheckStage.regStats();
|
|
scheduleStage.regStats();
|
|
execStage.regStats();
|
|
|
|
// register stats of memory pipeline
|
|
globalMemoryPipe.regStats();
|
|
localMemoryPipe.regStats();
|
|
}
|
|
|
|
void
|
|
ComputeUnit::updateInstStats(GPUDynInstPtr gpuDynInst)
|
|
{
|
|
if (gpuDynInst->isScalar()) {
|
|
if (gpuDynInst->isALU() && !gpuDynInst->isWaitcnt()) {
|
|
sALUInsts++;
|
|
instCyclesSALU++;
|
|
} else if (gpuDynInst->isLoad()) {
|
|
scalarMemReads++;
|
|
} else if (gpuDynInst->isStore()) {
|
|
scalarMemWrites++;
|
|
}
|
|
} else {
|
|
if (gpuDynInst->isALU()) {
|
|
vALUInsts++;
|
|
instCyclesVALU++;
|
|
threadCyclesVALU += gpuDynInst->wavefront()->execMask().count();
|
|
} else if (gpuDynInst->isFlat()) {
|
|
if (gpuDynInst->isLocalMem()) {
|
|
flatLDSInsts++;
|
|
} else {
|
|
flatVMemInsts++;
|
|
}
|
|
} else if (gpuDynInst->isLocalMem()) {
|
|
ldsNoFlatInsts++;
|
|
} else if (gpuDynInst->isLoad()) {
|
|
vectorMemReads++;
|
|
} else if (gpuDynInst->isStore()) {
|
|
vectorMemWrites++;
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
ComputeUnit::updatePageDivergenceDist(Addr addr)
|
|
{
|
|
Addr virt_page_addr = roundDown(addr, TheISA::PageBytes);
|
|
|
|
if (!pagesTouched.count(virt_page_addr))
|
|
pagesTouched[virt_page_addr] = 1;
|
|
else
|
|
pagesTouched[virt_page_addr]++;
|
|
}
|
|
|
|
void
|
|
ComputeUnit::CUExitCallback::process()
|
|
{
|
|
if (computeUnit->countPages) {
|
|
std::ostream *page_stat_file =
|
|
simout.create(computeUnit->name().c_str())->stream();
|
|
|
|
*page_stat_file << "page, wavefront accesses, workitem accesses" <<
|
|
std::endl;
|
|
|
|
for (auto iter : computeUnit->pageAccesses) {
|
|
*page_stat_file << std::hex << iter.first << ",";
|
|
*page_stat_file << std::dec << iter.second.first << ",";
|
|
*page_stat_file << std::dec << iter.second.second << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
ComputeUnit::isDone() const
|
|
{
|
|
for (int i = 0; i < numSIMDs; ++i) {
|
|
if (!isSimdDone(i)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool glbMemBusRdy = true;
|
|
for (int j = 0; j < numGlbMemUnits; ++j) {
|
|
glbMemBusRdy &= vrfToGlobalMemPipeBus[j].rdy();
|
|
}
|
|
bool locMemBusRdy = true;
|
|
for (int j = 0; j < numLocMemUnits; ++j) {
|
|
locMemBusRdy &= vrfToLocalMemPipeBus[j].rdy();
|
|
}
|
|
|
|
if (!globalMemoryPipe.isGMLdRespFIFOWrRdy() ||
|
|
!globalMemoryPipe.isGMStRespFIFOWrRdy() ||
|
|
!globalMemoryPipe.isGMReqFIFOWrRdy() || !localMemoryPipe.isLMReqFIFOWrRdy()
|
|
|| !localMemoryPipe.isLMRespFIFOWrRdy() || !locMemToVrfBus.rdy() ||
|
|
!glbMemToVrfBus.rdy() || !locMemBusRdy || !glbMemBusRdy) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
int32_t
|
|
ComputeUnit::getRefCounter(const uint32_t dispatchId, const uint32_t wgId) const
|
|
{
|
|
return lds.getRefCounter(dispatchId, wgId);
|
|
}
|
|
|
|
bool
|
|
ComputeUnit::isSimdDone(uint32_t simdId) const
|
|
{
|
|
assert(simdId < numSIMDs);
|
|
|
|
for (int i=0; i < numGlbMemUnits; ++i) {
|
|
if (!vrfToGlobalMemPipeBus[i].rdy())
|
|
return false;
|
|
}
|
|
for (int i=0; i < numLocMemUnits; ++i) {
|
|
if (!vrfToLocalMemPipeBus[i].rdy())
|
|
return false;
|
|
}
|
|
if (!aluPipe[simdId].rdy()) {
|
|
return false;
|
|
}
|
|
|
|
for (int i_wf = 0; i_wf < shader->n_wf; ++i_wf){
|
|
if (wfList[simdId][i_wf]->status != Wavefront::S_STOPPED) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* send a general request to the LDS
|
|
* make sure to look at the return value here as your request might be
|
|
* NACK'd and returning false means that you have to have some backup plan
|
|
*/
|
|
bool
|
|
ComputeUnit::sendToLds(GPUDynInstPtr gpuDynInst)
|
|
{
|
|
// this is just a request to carry the GPUDynInstPtr
|
|
// back and forth
|
|
Request *newRequest = new Request();
|
|
newRequest->setPaddr(0x0);
|
|
|
|
// ReadReq is not evaluted by the LDS but the Packet ctor requires this
|
|
PacketPtr newPacket = new Packet(newRequest, MemCmd::ReadReq);
|
|
|
|
// This is the SenderState needed upon return
|
|
newPacket->senderState = new LDSPort::SenderState(gpuDynInst);
|
|
|
|
return ldsPort->sendTimingReq(newPacket);
|
|
}
|
|
|
|
/**
|
|
* get the result of packets sent to the LDS when they return
|
|
*/
|
|
bool
|
|
ComputeUnit::LDSPort::recvTimingResp(PacketPtr packet)
|
|
{
|
|
const ComputeUnit::LDSPort::SenderState *senderState =
|
|
dynamic_cast<ComputeUnit::LDSPort::SenderState *>(packet->senderState);
|
|
|
|
fatal_if(!senderState, "did not get the right sort of sender state");
|
|
|
|
GPUDynInstPtr gpuDynInst = senderState->getMemInst();
|
|
|
|
delete packet->senderState;
|
|
delete packet->req;
|
|
delete packet;
|
|
|
|
computeUnit->localMemoryPipe.getLMRespFIFO().push(gpuDynInst);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* attempt to send this packet, either the port is already stalled, the request
|
|
* is nack'd and must stall or the request goes through
|
|
* when a request cannot be sent, add it to the retries queue
|
|
*/
|
|
bool
|
|
ComputeUnit::LDSPort::sendTimingReq(PacketPtr pkt)
|
|
{
|
|
ComputeUnit::LDSPort::SenderState *sender_state =
|
|
dynamic_cast<ComputeUnit::LDSPort::SenderState*>(pkt->senderState);
|
|
fatal_if(!sender_state, "packet without a valid sender state");
|
|
|
|
GPUDynInstPtr gpuDynInst M5_VAR_USED = sender_state->getMemInst();
|
|
|
|
if (isStalled()) {
|
|
fatal_if(retries.empty(), "must have retries waiting to be stalled");
|
|
|
|
retries.push(pkt);
|
|
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: LDS send failed!\n",
|
|
computeUnit->cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId);
|
|
return false;
|
|
} else if (!MasterPort::sendTimingReq(pkt)) {
|
|
// need to stall the LDS port until a recvReqRetry() is received
|
|
// this indicates that there is more space
|
|
stallPort();
|
|
retries.push(pkt);
|
|
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req failed!\n",
|
|
computeUnit->cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId, pkt->req->getPaddr());
|
|
return false;
|
|
} else {
|
|
DPRINTF(GPUPort, "CU%d: WF[%d][%d]: addr %#x lds req sent!\n",
|
|
computeUnit->cu_id, gpuDynInst->simdId,
|
|
gpuDynInst->wfSlotId, pkt->req->getPaddr());
|
|
return true;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* the bus is telling the port that there is now space so retrying stalled
|
|
* requests should work now
|
|
* this allows the port to have a request be nack'd and then have the receiver
|
|
* say when there is space, rather than simply retrying the send every cycle
|
|
*/
|
|
void
|
|
ComputeUnit::LDSPort::recvReqRetry()
|
|
{
|
|
auto queueSize = retries.size();
|
|
|
|
DPRINTF(GPUPort, "CU%d: LDSPort recvReqRetry - %d pending requests\n",
|
|
computeUnit->cu_id, queueSize);
|
|
|
|
fatal_if(queueSize < 1,
|
|
"why was there a recvReqRetry() with no pending reqs?");
|
|
fatal_if(!isStalled(),
|
|
"recvReqRetry() happened when the port was not stalled");
|
|
|
|
unstallPort();
|
|
|
|
while (!retries.empty()) {
|
|
PacketPtr packet = retries.front();
|
|
|
|
DPRINTF(GPUPort, "CU%d: retrying LDS send\n", computeUnit->cu_id);
|
|
|
|
if (!MasterPort::sendTimingReq(packet)) {
|
|
// Stall port
|
|
stallPort();
|
|
DPRINTF(GPUPort, ": LDS send failed again\n");
|
|
break;
|
|
} else {
|
|
DPRINTF(GPUTLB, ": LDS send successful\n");
|
|
retries.pop();
|
|
}
|
|
}
|
|
}
|