diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc index 5ec061172..83e2414db 100644 --- a/src/gpu-compute/compute_unit.cc +++ b/src/gpu-compute/compute_unit.cc @@ -192,50 +192,6 @@ ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr) w->roSize = ndr->q.roMemTotal; } -void -ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, - int trueWgSize[], int trueWgSizeTotal, - LdsChunk *ldsChunk, uint64_t origSpillMemStart) -{ - wfCtx->cnt = cnt; - - VectorMask init_mask; - init_mask.reset(); - - for (int k = 0; k < wfSize(); ++k) { - if (k + cnt * wfSize() < trueWgSizeTotal) - init_mask[k] = 1; - } - - wfCtx->init_mask = init_mask.to_ullong(); - wfCtx->exec_mask = init_mask.to_ullong(); - - wfCtx->bar_cnt.resize(wfSize(), 0); - - wfCtx->max_bar_cnt = 0; - wfCtx->old_barrier_cnt = 0; - wfCtx->barrier_cnt = 0; - - wfCtx->privBase = ndr->q.privMemStart; - ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); - - wfCtx->spillBase = ndr->q.spillMemStart; - ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); - - wfCtx->pc = 0; - wfCtx->rpc = UINT32_MAX; - - // set the wavefront context to have a pointer to this section of the LDS - wfCtx->ldsChunk = ldsChunk; - - // WG state - wfCtx->wg_id = ndr->globalWgId; - wfCtx->barrier_id = barrier_id; - - // Kernel wide state - wfCtx->ndr = ndr; -} - void ComputeUnit::updateEvents() { @@ -264,19 +220,25 @@ ComputeUnit::updateEvents() { void -ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], - int trueWgSizeTotal) +ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal, + int cnt, LdsChunk *ldsChunk, NDRange *ndr) { static int _n_wave = 0; - int cnt = wfCtx->cnt; - NDRange *ndr = wfCtx->ndr; // Fill in Kernel state FillKernelState(w, ndr); + VectorMask init_mask; + init_mask.reset(); + + for (int k = 0; k < wfSize(); ++k) { + if (k + cnt * wfSize() < trueWgSizeTotal) + init_mask[k] = 1; + } + w->kern_id = ndr->dispatchId; w->dynwaveid = 
cnt; - w->init_mask = wfCtx->init_mask; + w->init_mask = init_mask.to_ullong(); for (int k = 0; k < wfSize(); ++k) { w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0]; @@ -290,32 +252,34 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], w->workitemid[0][k]; } - w->old_barrier_cnt = wfCtx->old_barrier_cnt; - w->barrier_cnt = wfCtx->barrier_cnt; w->barrier_slots = divCeil(trueWgSizeTotal, wfSize()); - for (int i = 0; i < wfSize(); ++i) { - w->bar_cnt[i] = wfCtx->bar_cnt[i]; - } + w->bar_cnt.resize(wfSize(), 0); - w->max_bar_cnt = wfCtx->max_bar_cnt; - w->privBase = wfCtx->privBase; - w->spillBase = wfCtx->spillBase; + w->max_bar_cnt = 0; + w->old_barrier_cnt = 0; + w->barrier_cnt = 0; - w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask); + w->privBase = ndr->q.privMemStart; + ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize(); + + w->spillBase = ndr->q.spillMemStart; + ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize(); + + w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ullong()); // WG state - w->wg_id = wfCtx->wg_id; - w->dispatchid = wfCtx->ndr->dispatchId; + w->wg_id = ndr->globalWgId; + w->dispatchid = ndr->dispatchId; w->workgroupid[0] = w->wg_id % ndr->numWg[0]; w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1]; w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]); - w->barrier_id = wfCtx->barrier_id; + w->barrier_id = barrier_id; w->stalledAtBarrier = false; - // move this from the context into the actual wavefront - w->ldsChunk = wfCtx->ldsChunk; + // give the wavefront a pointer to this section of the LDS + w->ldsChunk = ldsChunk; int32_t refCount M5_VAR_USED = lds.increaseRefCounter(w->dispatchid, w->wg_id); @@ -340,7 +304,6 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId); w->start(++_n_wave, ndr->q.code_ptr); - wfCtx->bar_cnt.clear(); } void @@ -376,7 +339,6 
@@ ComputeUnit::StartWorkgroup(NDRange *ndr) trueWgSizeTotal *= trueWgSize[d]; } - uint64_t origSpillMemStart = ndr->q.spillMemStart; // calculate the number of 32-bit vector registers required by wavefront int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount); int cnt = 0; @@ -403,12 +365,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr) w->reservedVectorRegs = normSize; vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs; - WFContext wfCtx; - - InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal, - ldsChunk, origSpillMemStart); - - StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal); + StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr); ++cnt; } } diff --git a/src/gpu-compute/compute_unit.hh b/src/gpu-compute/compute_unit.hh index a234cbeb5..34b710cd6 100644 --- a/src/gpu-compute/compute_unit.hh +++ b/src/gpu-compute/compute_unit.hh @@ -256,12 +256,8 @@ class ComputeUnit : public MemObject void fetch(PacketPtr pkt, Wavefront *wavefront); void FillKernelState(Wavefront *w, NDRange *ndr); - void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[], - int trueWgSizeTotal); - - void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt, - int trueWgSize[], int trueWgSizeTotal, - LdsChunk *ldsChunk, uint64_t origSpillMemStart); + void StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal, + int cnt, LdsChunk *ldsChunk, NDRange *ndr); void StartWorkgroup(NDRange *ndr); int ReadyWorkgroup(NDRange *ndr); diff --git a/src/gpu-compute/qstruct.hh b/src/gpu-compute/qstruct.hh index 7bca757b8..b400dc0ee 100644 --- a/src/gpu-compute/qstruct.hh +++ b/src/gpu-compute/qstruct.hh @@ -95,59 +95,6 @@ struct HsaQueueEntry uint16_t num_args; }; -// State used to start (or restart) a WF -struct WFContext -{ - // 32 bit values - // barrier state - std::vector bar_cnt; - - // id (which WF in the WG) - int cnt; - - // more barrier state - int max_bar_cnt; - int old_barrier_cnt; - int barrier_cnt; - - // More Program Counter Stuff - uint32_t pc; - 
- // Program counter of the immediate post-dominator instruction - uint32_t rpc; - - // WG wide state (I don't see how to avoid redundancy here) - int cu_id; - uint32_t wg_id; - uint32_t barrier_id; - - // 64 bit values (these values depend on the wavefront size) - // masks - uint64_t init_mask; - uint64_t exec_mask; - - // private memory; - Addr privBase; - Addr spillBase; - - LdsChunk *ldsChunk; - - /* - * Kernel wide state - * This is a hack. This state should be moved through simulated memory - * during a yield. Though not much is being used here, so it's probably - * probably not a big deal. - * - * Just to add to this comment... The ndr is derived from simulated - * memory when the cl-runtime allocates an HsaQueueEntry and populates it - * for a kernel launch. So in theory the runtime should be able to keep - * that state around. Then a WF can reference it upon restart to derive - * kernel wide state. The runtime can deallocate the state when the - * kernel completes. - */ - NDRange *ndr; -}; - // State that needs to be passed between the simulation and simulated app, a // pointer to this struct can be passed through the depends field in the // HsaQueueEntry struct