gpu-compute: Remove WFContext
The WFContext struct is currently unused and is no longer useful for saving and restoring the context of a Wavefront. The Wavefront class should be sufficient for that purpose, and the runtime can determine, through an IOCTL, how much memory it needs to allocate for a Wavefront.
This commit is contained in:
parent
ada0e2f02f
commit
e9fe1b838b
3 changed files with 30 additions and 130 deletions
|
@ -192,50 +192,6 @@ ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
|
|||
w->roSize = ndr->q.roMemTotal;
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
|
||||
int trueWgSize[], int trueWgSizeTotal,
|
||||
LdsChunk *ldsChunk, uint64_t origSpillMemStart)
|
||||
{
|
||||
wfCtx->cnt = cnt;
|
||||
|
||||
VectorMask init_mask;
|
||||
init_mask.reset();
|
||||
|
||||
for (int k = 0; k < wfSize(); ++k) {
|
||||
if (k + cnt * wfSize() < trueWgSizeTotal)
|
||||
init_mask[k] = 1;
|
||||
}
|
||||
|
||||
wfCtx->init_mask = init_mask.to_ullong();
|
||||
wfCtx->exec_mask = init_mask.to_ullong();
|
||||
|
||||
wfCtx->bar_cnt.resize(wfSize(), 0);
|
||||
|
||||
wfCtx->max_bar_cnt = 0;
|
||||
wfCtx->old_barrier_cnt = 0;
|
||||
wfCtx->barrier_cnt = 0;
|
||||
|
||||
wfCtx->privBase = ndr->q.privMemStart;
|
||||
ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
|
||||
|
||||
wfCtx->spillBase = ndr->q.spillMemStart;
|
||||
ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
|
||||
|
||||
wfCtx->pc = 0;
|
||||
wfCtx->rpc = UINT32_MAX;
|
||||
|
||||
// set the wavefront context to have a pointer to this section of the LDS
|
||||
wfCtx->ldsChunk = ldsChunk;
|
||||
|
||||
// WG state
|
||||
wfCtx->wg_id = ndr->globalWgId;
|
||||
wfCtx->barrier_id = barrier_id;
|
||||
|
||||
// Kernel wide state
|
||||
wfCtx->ndr = ndr;
|
||||
}
|
||||
|
||||
void
|
||||
ComputeUnit::updateEvents() {
|
||||
|
||||
|
@ -264,19 +220,25 @@ ComputeUnit::updateEvents() {
|
|||
|
||||
|
||||
void
|
||||
ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
||||
int trueWgSizeTotal)
|
||||
ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
|
||||
int cnt, LdsChunk *ldsChunk, NDRange *ndr)
|
||||
{
|
||||
static int _n_wave = 0;
|
||||
int cnt = wfCtx->cnt;
|
||||
NDRange *ndr = wfCtx->ndr;
|
||||
|
||||
// Fill in Kernel state
|
||||
FillKernelState(w, ndr);
|
||||
|
||||
VectorMask init_mask;
|
||||
init_mask.reset();
|
||||
|
||||
for (int k = 0; k < wfSize(); ++k) {
|
||||
if (k + cnt * wfSize() < trueWgSizeTotal)
|
||||
init_mask[k] = 1;
|
||||
}
|
||||
|
||||
w->kern_id = ndr->dispatchId;
|
||||
w->dynwaveid = cnt;
|
||||
w->init_mask = wfCtx->init_mask;
|
||||
w->init_mask = init_mask.to_ullong();
|
||||
|
||||
for (int k = 0; k < wfSize(); ++k) {
|
||||
w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
|
||||
|
@ -290,32 +252,34 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
|||
w->workitemid[0][k];
|
||||
}
|
||||
|
||||
w->old_barrier_cnt = wfCtx->old_barrier_cnt;
|
||||
w->barrier_cnt = wfCtx->barrier_cnt;
|
||||
w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
|
||||
|
||||
for (int i = 0; i < wfSize(); ++i) {
|
||||
w->bar_cnt[i] = wfCtx->bar_cnt[i];
|
||||
}
|
||||
w->bar_cnt.resize(wfSize(), 0);
|
||||
|
||||
w->max_bar_cnt = wfCtx->max_bar_cnt;
|
||||
w->privBase = wfCtx->privBase;
|
||||
w->spillBase = wfCtx->spillBase;
|
||||
w->max_bar_cnt = 0;
|
||||
w->old_barrier_cnt = 0;
|
||||
w->barrier_cnt = 0;
|
||||
|
||||
w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
|
||||
w->privBase = ndr->q.privMemStart;
|
||||
ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
|
||||
|
||||
w->spillBase = ndr->q.spillMemStart;
|
||||
ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
|
||||
|
||||
w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
|
||||
|
||||
// WG state
|
||||
w->wg_id = wfCtx->wg_id;
|
||||
w->dispatchid = wfCtx->ndr->dispatchId;
|
||||
w->wg_id = ndr->globalWgId;
|
||||
w->dispatchid = ndr->dispatchId;
|
||||
w->workgroupid[0] = w->wg_id % ndr->numWg[0];
|
||||
w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
|
||||
w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
|
||||
|
||||
w->barrier_id = wfCtx->barrier_id;
|
||||
w->barrier_id = barrier_id;
|
||||
w->stalledAtBarrier = false;
|
||||
|
||||
// move this from the context into the actual wavefront
|
||||
w->ldsChunk = wfCtx->ldsChunk;
|
||||
// set the wavefront context to have a pointer to this section of the LDS
|
||||
w->ldsChunk = ldsChunk;
|
||||
|
||||
int32_t refCount M5_VAR_USED =
|
||||
lds.increaseRefCounter(w->dispatchid, w->wg_id);
|
||||
|
@ -340,7 +304,6 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
|||
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
|
||||
|
||||
w->start(++_n_wave, ndr->q.code_ptr);
|
||||
wfCtx->bar_cnt.clear();
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -376,7 +339,6 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
|
|||
trueWgSizeTotal *= trueWgSize[d];
|
||||
}
|
||||
|
||||
uint64_t origSpillMemStart = ndr->q.spillMemStart;
|
||||
// calculate the number of 32-bit vector registers required by wavefront
|
||||
int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
|
||||
int cnt = 0;
|
||||
|
@ -403,12 +365,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
|
|||
w->reservedVectorRegs = normSize;
|
||||
vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
|
||||
|
||||
WFContext wfCtx;
|
||||
|
||||
InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
|
||||
ldsChunk, origSpillMemStart);
|
||||
|
||||
StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
|
||||
StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr);
|
||||
++cnt;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -256,12 +256,8 @@ class ComputeUnit : public MemObject
|
|||
void fetch(PacketPtr pkt, Wavefront *wavefront);
|
||||
void FillKernelState(Wavefront *w, NDRange *ndr);
|
||||
|
||||
void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
||||
int trueWgSizeTotal);
|
||||
|
||||
void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
|
||||
int trueWgSize[], int trueWgSizeTotal,
|
||||
LdsChunk *ldsChunk, uint64_t origSpillMemStart);
|
||||
void StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
|
||||
int cnt, LdsChunk *ldsChunk, NDRange *ndr);
|
||||
|
||||
void StartWorkgroup(NDRange *ndr);
|
||||
int ReadyWorkgroup(NDRange *ndr);
|
||||
|
|
|
@ -95,59 +95,6 @@ struct HsaQueueEntry
|
|||
uint16_t num_args;
|
||||
};
|
||||
|
||||
// State used to start (or restart) a WF
|
||||
struct WFContext
|
||||
{
|
||||
// 32 bit values
|
||||
// barrier state
|
||||
std::vector<int> bar_cnt;
|
||||
|
||||
// id (which WF in the WG)
|
||||
int cnt;
|
||||
|
||||
// more barrier state
|
||||
int max_bar_cnt;
|
||||
int old_barrier_cnt;
|
||||
int barrier_cnt;
|
||||
|
||||
// More Program Counter Stuff
|
||||
uint32_t pc;
|
||||
|
||||
// Program counter of the immediate post-dominator instruction
|
||||
uint32_t rpc;
|
||||
|
||||
// WG wide state (I don't see how to avoid redundancy here)
|
||||
int cu_id;
|
||||
uint32_t wg_id;
|
||||
uint32_t barrier_id;
|
||||
|
||||
// 64 bit values (these values depend on the wavefront size)
|
||||
// masks
|
||||
uint64_t init_mask;
|
||||
uint64_t exec_mask;
|
||||
|
||||
// private memory;
|
||||
Addr privBase;
|
||||
Addr spillBase;
|
||||
|
||||
LdsChunk *ldsChunk;
|
||||
|
||||
/*
|
||||
* Kernel wide state
|
||||
* This is a hack. This state should be moved through simulated memory
|
||||
* during a yield. Though not much is being used here, so it's probably
|
||||
* not a big deal.
|
||||
*
|
||||
* Just to add to this comment... The ndr is derived from simulated
|
||||
* memory when the cl-runtime allocates an HsaQueueEntry and populates it
|
||||
* for a kernel launch. So in theory the runtime should be able to keep
|
||||
* that state around. Then a WF can reference it upon restart to derive
|
||||
* kernel wide state. The runtime can deallocate the state when the
|
||||
* kernel completes.
|
||||
*/
|
||||
NDRange *ndr;
|
||||
};
|
||||
|
||||
// State that needs to be passed between the simulation and simulated app, a
|
||||
// pointer to this struct can be passed through the depends field in the
|
||||
// HsaQueueEntry struct
|
||||
|
|
Loading…
Reference in a new issue