gpu-compute: Remove WFContext
The WFContext struct is currently unused, and it is no longer useful for saving and restoring the context of a Wavefront. The Wavefront class should be sufficient for that purpose, and the runtime can determine how much memory it needs to allocate for a Wavefront through an IOCTL.
This commit is contained in:
parent
ada0e2f02f
commit
e9fe1b838b
3 changed files with 30 additions and 130 deletions
|
@ -192,50 +192,6 @@ ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
|
||||||
w->roSize = ndr->q.roMemTotal;
|
w->roSize = ndr->q.roMemTotal;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
|
||||||
ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
|
|
||||||
int trueWgSize[], int trueWgSizeTotal,
|
|
||||||
LdsChunk *ldsChunk, uint64_t origSpillMemStart)
|
|
||||||
{
|
|
||||||
wfCtx->cnt = cnt;
|
|
||||||
|
|
||||||
VectorMask init_mask;
|
|
||||||
init_mask.reset();
|
|
||||||
|
|
||||||
for (int k = 0; k < wfSize(); ++k) {
|
|
||||||
if (k + cnt * wfSize() < trueWgSizeTotal)
|
|
||||||
init_mask[k] = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
wfCtx->init_mask = init_mask.to_ullong();
|
|
||||||
wfCtx->exec_mask = init_mask.to_ullong();
|
|
||||||
|
|
||||||
wfCtx->bar_cnt.resize(wfSize(), 0);
|
|
||||||
|
|
||||||
wfCtx->max_bar_cnt = 0;
|
|
||||||
wfCtx->old_barrier_cnt = 0;
|
|
||||||
wfCtx->barrier_cnt = 0;
|
|
||||||
|
|
||||||
wfCtx->privBase = ndr->q.privMemStart;
|
|
||||||
ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
|
|
||||||
|
|
||||||
wfCtx->spillBase = ndr->q.spillMemStart;
|
|
||||||
ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
|
|
||||||
|
|
||||||
wfCtx->pc = 0;
|
|
||||||
wfCtx->rpc = UINT32_MAX;
|
|
||||||
|
|
||||||
// set the wavefront context to have a pointer to this section of the LDS
|
|
||||||
wfCtx->ldsChunk = ldsChunk;
|
|
||||||
|
|
||||||
// WG state
|
|
||||||
wfCtx->wg_id = ndr->globalWgId;
|
|
||||||
wfCtx->barrier_id = barrier_id;
|
|
||||||
|
|
||||||
// Kernel wide state
|
|
||||||
wfCtx->ndr = ndr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ComputeUnit::updateEvents() {
|
ComputeUnit::updateEvents() {
|
||||||
|
|
||||||
|
@ -264,19 +220,25 @@ ComputeUnit::updateEvents() {
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
|
||||||
int trueWgSizeTotal)
|
int cnt, LdsChunk *ldsChunk, NDRange *ndr)
|
||||||
{
|
{
|
||||||
static int _n_wave = 0;
|
static int _n_wave = 0;
|
||||||
int cnt = wfCtx->cnt;
|
|
||||||
NDRange *ndr = wfCtx->ndr;
|
|
||||||
|
|
||||||
// Fill in Kernel state
|
// Fill in Kernel state
|
||||||
FillKernelState(w, ndr);
|
FillKernelState(w, ndr);
|
||||||
|
|
||||||
|
VectorMask init_mask;
|
||||||
|
init_mask.reset();
|
||||||
|
|
||||||
|
for (int k = 0; k < wfSize(); ++k) {
|
||||||
|
if (k + cnt * wfSize() < trueWgSizeTotal)
|
||||||
|
init_mask[k] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
w->kern_id = ndr->dispatchId;
|
w->kern_id = ndr->dispatchId;
|
||||||
w->dynwaveid = cnt;
|
w->dynwaveid = cnt;
|
||||||
w->init_mask = wfCtx->init_mask;
|
w->init_mask = init_mask.to_ullong();
|
||||||
|
|
||||||
for (int k = 0; k < wfSize(); ++k) {
|
for (int k = 0; k < wfSize(); ++k) {
|
||||||
w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
|
w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
|
||||||
|
@ -290,32 +252,34 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
||||||
w->workitemid[0][k];
|
w->workitemid[0][k];
|
||||||
}
|
}
|
||||||
|
|
||||||
w->old_barrier_cnt = wfCtx->old_barrier_cnt;
|
|
||||||
w->barrier_cnt = wfCtx->barrier_cnt;
|
|
||||||
w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
|
w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
|
||||||
|
|
||||||
for (int i = 0; i < wfSize(); ++i) {
|
w->bar_cnt.resize(wfSize(), 0);
|
||||||
w->bar_cnt[i] = wfCtx->bar_cnt[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
w->max_bar_cnt = wfCtx->max_bar_cnt;
|
w->max_bar_cnt = 0;
|
||||||
w->privBase = wfCtx->privBase;
|
w->old_barrier_cnt = 0;
|
||||||
w->spillBase = wfCtx->spillBase;
|
w->barrier_cnt = 0;
|
||||||
|
|
||||||
w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
|
w->privBase = ndr->q.privMemStart;
|
||||||
|
ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
|
||||||
|
|
||||||
|
w->spillBase = ndr->q.spillMemStart;
|
||||||
|
ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
|
||||||
|
|
||||||
|
w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
|
||||||
|
|
||||||
// WG state
|
// WG state
|
||||||
w->wg_id = wfCtx->wg_id;
|
w->wg_id = ndr->globalWgId;
|
||||||
w->dispatchid = wfCtx->ndr->dispatchId;
|
w->dispatchid = ndr->dispatchId;
|
||||||
w->workgroupid[0] = w->wg_id % ndr->numWg[0];
|
w->workgroupid[0] = w->wg_id % ndr->numWg[0];
|
||||||
w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
|
w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
|
||||||
w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
|
w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
|
||||||
|
|
||||||
w->barrier_id = wfCtx->barrier_id;
|
w->barrier_id = barrier_id;
|
||||||
w->stalledAtBarrier = false;
|
w->stalledAtBarrier = false;
|
||||||
|
|
||||||
// move this from the context into the actual wavefront
|
// set the wavefront context to have a pointer to this section of the LDS
|
||||||
w->ldsChunk = wfCtx->ldsChunk;
|
w->ldsChunk = ldsChunk;
|
||||||
|
|
||||||
int32_t refCount M5_VAR_USED =
|
int32_t refCount M5_VAR_USED =
|
||||||
lds.increaseRefCounter(w->dispatchid, w->wg_id);
|
lds.increaseRefCounter(w->dispatchid, w->wg_id);
|
||||||
|
@ -340,7 +304,6 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
||||||
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
|
"WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
|
||||||
|
|
||||||
w->start(++_n_wave, ndr->q.code_ptr);
|
w->start(++_n_wave, ndr->q.code_ptr);
|
||||||
wfCtx->bar_cnt.clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
@ -376,7 +339,6 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
|
||||||
trueWgSizeTotal *= trueWgSize[d];
|
trueWgSizeTotal *= trueWgSize[d];
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t origSpillMemStart = ndr->q.spillMemStart;
|
|
||||||
// calculate the number of 32-bit vector registers required by wavefront
|
// calculate the number of 32-bit vector registers required by wavefront
|
||||||
int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
|
int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
|
@ -403,12 +365,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
|
||||||
w->reservedVectorRegs = normSize;
|
w->reservedVectorRegs = normSize;
|
||||||
vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
|
vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
|
||||||
|
|
||||||
WFContext wfCtx;
|
StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr);
|
||||||
|
|
||||||
InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
|
|
||||||
ldsChunk, origSpillMemStart);
|
|
||||||
|
|
||||||
StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
|
|
||||||
++cnt;
|
++cnt;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -256,12 +256,8 @@ class ComputeUnit : public MemObject
|
||||||
void fetch(PacketPtr pkt, Wavefront *wavefront);
|
void fetch(PacketPtr pkt, Wavefront *wavefront);
|
||||||
void FillKernelState(Wavefront *w, NDRange *ndr);
|
void FillKernelState(Wavefront *w, NDRange *ndr);
|
||||||
|
|
||||||
void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
|
void StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
|
||||||
int trueWgSizeTotal);
|
int cnt, LdsChunk *ldsChunk, NDRange *ndr);
|
||||||
|
|
||||||
void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
|
|
||||||
int trueWgSize[], int trueWgSizeTotal,
|
|
||||||
LdsChunk *ldsChunk, uint64_t origSpillMemStart);
|
|
||||||
|
|
||||||
void StartWorkgroup(NDRange *ndr);
|
void StartWorkgroup(NDRange *ndr);
|
||||||
int ReadyWorkgroup(NDRange *ndr);
|
int ReadyWorkgroup(NDRange *ndr);
|
||||||
|
|
|
@ -95,59 +95,6 @@ struct HsaQueueEntry
|
||||||
uint16_t num_args;
|
uint16_t num_args;
|
||||||
};
|
};
|
||||||
|
|
||||||
// State used to start (or restart) a WF
|
|
||||||
struct WFContext
|
|
||||||
{
|
|
||||||
// 32 bit values
|
|
||||||
// barrier state
|
|
||||||
std::vector<int> bar_cnt;
|
|
||||||
|
|
||||||
// id (which WF in the WG)
|
|
||||||
int cnt;
|
|
||||||
|
|
||||||
// more barrier state
|
|
||||||
int max_bar_cnt;
|
|
||||||
int old_barrier_cnt;
|
|
||||||
int barrier_cnt;
|
|
||||||
|
|
||||||
// More Program Counter Stuff
|
|
||||||
uint32_t pc;
|
|
||||||
|
|
||||||
// Program counter of the immediate post-dominator instruction
|
|
||||||
uint32_t rpc;
|
|
||||||
|
|
||||||
// WG wide state (I don't see how to avoid redundancy here)
|
|
||||||
int cu_id;
|
|
||||||
uint32_t wg_id;
|
|
||||||
uint32_t barrier_id;
|
|
||||||
|
|
||||||
// 64 bit values (these values depend on the wavefront size)
|
|
||||||
// masks
|
|
||||||
uint64_t init_mask;
|
|
||||||
uint64_t exec_mask;
|
|
||||||
|
|
||||||
// private memory;
|
|
||||||
Addr privBase;
|
|
||||||
Addr spillBase;
|
|
||||||
|
|
||||||
LdsChunk *ldsChunk;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Kernel wide state
|
|
||||||
* This is a hack. This state should be moved through simulated memory
|
|
||||||
* during a yield. Though not much is being used here, so it's probably
|
|
||||||
* not a big deal.
|
|
||||||
*
|
|
||||||
* Just to add to this comment... The ndr is derived from simulated
|
|
||||||
* memory when the cl-runtime allocates an HsaQueueEntry and populates it
|
|
||||||
* for a kernel launch. So in theory the runtime should be able to keep
|
|
||||||
* that state around. Then a WF can reference it upon restart to derive
|
|
||||||
* kernel wide state. The runtime can deallocate the state when the
|
|
||||||
* kernel completes.
|
|
||||||
*/
|
|
||||||
NDRange *ndr;
|
|
||||||
};
|
|
||||||
|
|
||||||
// State that needs to be passed between the simulation and simulated app, a
|
// State that needs to be passed between the simulation and simulated app, a
|
||||||
// pointer to this struct can be passed through the depends field in the
|
// pointer to this struct can be passed through the depends field in the
|
||||||
// HsaQueueEntry struct
|
// HsaQueueEntry struct
|
||||||
|
|
Loading…
Reference in a new issue