gpu-compute: Remove WFContext

The WFContext struct is currently unused and is no longer useful for saving and restoring the context of a Wavefront. The Wavefront class should be sufficient for that purpose, and the runtime can determine how much memory it needs to allocate for a Wavefront through an IOCTL.
2016-09-16 12:26:03 -04:00 · 2016-09-16 12:26:03 -04:00 · e9fe1b838b
commit e9fe1b838b
parent ada0e2f02f
3 changed files with 30 additions and 130 deletions
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@ -192,50 +192,6 @@ ComputeUnit::FillKernelState(Wavefront *w, NDRange *ndr)
    w->roSize = ndr->q.roMemTotal;
 }
 // Fill *wfCtx with the start-up state for wavefront number `cnt` of a
 // workgroup: lane masks, barrier counters, private/spill base addresses,
 // initial pc/rpc, the LDS chunk, and the workgroup/kernel identifiers.
 //
 // Side effect: advances ndr->q.privMemStart and ndr->q.spillMemStart by one
 // wavefront's worth of per-item memory, so successive calls hand out
 // consecutive regions.
 //
 // Note: trueWgSize and origSpillMemStart are accepted but never read in
 // this body.
 void
 ComputeUnit::InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
                        int trueWgSize[], int trueWgSizeTotal,
                        LdsChunk *ldsChunk, uint64_t origSpillMemStart)
 {
    wfCtx->cnt = cnt;
    // Enable lane k only when its flattened work-item index
    // (k + cnt * wfSize()) is below trueWgSizeTotal; the remaining lanes
    // stay cleared by the reset() below.
    VectorMask init_mask;
    init_mask.reset();
    for (int k = 0; k < wfSize(); ++k) {
        if (k + cnt * wfSize() < trueWgSizeTotal)
            init_mask[k] = 1;
    }
    // The initial mask and the execution mask start out identical.
    wfCtx->init_mask = init_mask.to_ullong();
    wfCtx->exec_mask = init_mask.to_ullong();
    // Barrier state: bar_cnt is sized to wfSize() (newly added entries are
    // 0) and the scalar barrier counters are reset to 0.
    wfCtx->bar_cnt.resize(wfSize(), 0);
    wfCtx->max_bar_cnt = 0;
    wfCtx->old_barrier_cnt = 0;
    wfCtx->barrier_cnt = 0;
    // Record the current private-memory start as this wavefront's base, then
    // bump the queue's running pointer past wfSize() items.
    wfCtx->privBase = ndr->q.privMemStart;
    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
    // Same scheme for spill memory.
    wfCtx->spillBase = ndr->q.spillMemStart;
    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
    // Start at pc 0. NOTE(review): rpc = UINT32_MAX presumably acts as a
    // "no reconvergence point" sentinel -- confirm against its consumers.
    wfCtx->pc = 0;
    wfCtx->rpc = UINT32_MAX;
    // set the wavefront context to have a pointer to this section of the LDS
    wfCtx->ldsChunk = ldsChunk;
    // WG state
    wfCtx->wg_id = ndr->globalWgId;
    // NOTE(review): barrier_id on the right-hand side is neither a parameter
    // nor a local; presumably a ComputeUnit member -- verify.
    wfCtx->barrier_id = barrier_id;
    // Kernel wide state
    wfCtx->ndr = ndr;
 }
 void
 ComputeUnit::updateEvents() {
@ -264,19 +220,25 @@ ComputeUnit::updateEvents() {
 void
-ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+ComputeUnit::StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
-                     int trueWgSizeTotal)
+                     int cnt, LdsChunk *ldsChunk, NDRange *ndr)
 {
    static int _n_wave = 0;
    int cnt = wfCtx->cnt;
    NDRange *ndr = wfCtx->ndr;
    // Fill in Kernel state
    FillKernelState(w, ndr);
    VectorMask init_mask;
    init_mask.reset();
    for (int k = 0; k < wfSize(); ++k) {
        if (k + cnt * wfSize() < trueWgSizeTotal)
            init_mask[k] = 1;
    }
    w->kern_id = ndr->dispatchId;
    w->dynwaveid = cnt;
-    w->init_mask = wfCtx->init_mask;
+    w->init_mask = init_mask.to_ullong();
    for (int k = 0; k < wfSize(); ++k) {
        w->workitemid[0][k] = (k+cnt*wfSize()) % trueWgSize[0];
@ -290,32 +252,34 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
            w->workitemid[0][k];
    }
    w->old_barrier_cnt = wfCtx->old_barrier_cnt;
    w->barrier_cnt = wfCtx->barrier_cnt;
    w->barrier_slots = divCeil(trueWgSizeTotal, wfSize());
-    for (int i = 0; i < wfSize(); ++i) {
+    w->bar_cnt.resize(wfSize(), 0);
        w->bar_cnt[i] = wfCtx->bar_cnt[i];
    }
-    w->max_bar_cnt = wfCtx->max_bar_cnt;
+    w->max_bar_cnt = 0;
-    w->privBase = wfCtx->privBase;
+    w->old_barrier_cnt = 0;
-    w->spillBase = wfCtx->spillBase;
+    w->barrier_cnt = 0;
-    w->pushToReconvergenceStack(wfCtx->pc, wfCtx->rpc, wfCtx->exec_mask);
+    w->privBase = ndr->q.privMemStart;
    ndr->q.privMemStart += ndr->q.privMemPerItem * wfSize();
    w->spillBase = ndr->q.spillMemStart;
    ndr->q.spillMemStart += ndr->q.spillMemPerItem * wfSize();
    w->pushToReconvergenceStack(0, UINT32_MAX, init_mask.to_ulong());
    // WG state
-    w->wg_id = wfCtx->wg_id;
+    w->wg_id = ndr->globalWgId;
-    w->dispatchid = wfCtx->ndr->dispatchId;
+    w->dispatchid = ndr->dispatchId;
    w->workgroupid[0] = w->wg_id % ndr->numWg[0];
    w->workgroupid[1] = (w->wg_id / ndr->numWg[0]) % ndr->numWg[1];
    w->workgroupid[2] = w->wg_id / (ndr->numWg[0] * ndr->numWg[1]);
-    w->barrier_id = wfCtx->barrier_id;
+    w->barrier_id = barrier_id;
    w->stalledAtBarrier = false;
-    // move this from the context into the actual wavefront
+    // set the wavefront context to have a pointer to this section of the LDS
-    w->ldsChunk = wfCtx->ldsChunk;
+    w->ldsChunk = ldsChunk;
    int32_t refCount M5_VAR_USED =
                    lds.increaseRefCounter(w->dispatchid, w->wg_id);
@ -340,7 +304,6 @@ ComputeUnit::StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
            "WF[%d][%d]\n", _n_wave, barrier_id, cu_id, w->simdId, w->wfSlotId);
    w->start(++_n_wave, ndr->q.code_ptr);
    wfCtx->bar_cnt.clear();
 }
 void
@ -376,7 +339,6 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
        trueWgSizeTotal *= trueWgSize[d];
    }
    uint64_t origSpillMemStart = ndr->q.spillMemStart;
    // calculate the number of 32-bit vector registers required by wavefront
    int vregDemand = ndr->q.sRegCount + (2 * ndr->q.dRegCount);
    int cnt = 0;
@ -403,12 +365,7 @@ ComputeUnit::StartWorkgroup(NDRange *ndr)
            w->reservedVectorRegs = normSize;
            vectorRegsReserved[m % numSIMDs] += w->reservedVectorRegs;
-            WFContext wfCtx;
+            StartWF(w, trueWgSize, trueWgSizeTotal, cnt, ldsChunk, ndr);
            InitializeWFContext(&wfCtx, ndr, cnt, trueWgSize, trueWgSizeTotal,
                                ldsChunk, origSpillMemStart);
            StartWF(w, &wfCtx, trueWgSize, trueWgSizeTotal);
            ++cnt;
        }
    }
--- a/src/gpu-compute/compute_unit.hh
+++ b/src/gpu-compute/compute_unit.hh
@ -256,12 +256,8 @@ class ComputeUnit : public MemObject
    void fetch(PacketPtr pkt, Wavefront *wavefront);
    void FillKernelState(Wavefront *w, NDRange *ndr);
-    void StartWF(Wavefront *w, WFContext *wfCtx, int trueWgSize[],
+    void StartWF(Wavefront *w, int trueWgSize[], int trueWgSizeTotal,
-                 int trueWgSizeTotal);
+                     int cnt, LdsChunk *ldsChunk, NDRange *ndr);
    void InitializeWFContext(WFContext *wfCtx, NDRange *ndr, int cnt,
                             int trueWgSize[], int trueWgSizeTotal,
                             LdsChunk *ldsChunk, uint64_t origSpillMemStart);
    void StartWorkgroup(NDRange *ndr);
    int ReadyWorkgroup(NDRange *ndr);
--- a/src/gpu-compute/qstruct.hh
+++ b/src/gpu-compute/qstruct.hh
@ -95,59 +95,6 @@ struct HsaQueueEntry
    uint16_t num_args;
 };
 // State used to start (or restart) a WF.
 // Plain aggregate: it carries the values needed to bring up a wavefront
 // (ids, masks, barrier counters, memory bases, pc/rpc) plus pointers to the
 // LDS chunk and the kernel's NDRange.
 struct WFContext
 {
    // 32 bit values
    // barrier state (a vector of int counters)
    std::vector<int> bar_cnt;
    // id (which WF in the WG)
    int cnt;
    // more barrier state
    int max_bar_cnt;
    int old_barrier_cnt;
    int barrier_cnt;
    // More Program Counter Stuff
    // Program counter at which the WF starts (or resumes)
    uint32_t pc;
    // Program counter of the immediate post-dominator instruction
    uint32_t rpc;
    // WG wide state (I don't see how to avoid redundancy here)
    // NOTE(review): cu_id is not assigned by InitializeWFContext in the
    // code visible here -- verify whether anything sets it.
    int cu_id;
    uint32_t wg_id;
    uint32_t barrier_id;
    // 64 bit values (these values depend on the wavefront size)
    // masks
    uint64_t init_mask;
    uint64_t exec_mask;
    // private memory: base addresses of this WF's private and spill regions
    Addr privBase;
    Addr spillBase;
    // Pointer to the LDS chunk used by this WF
    LdsChunk *ldsChunk;
    /*
     * Kernel wide state
     * This is a hack. This state should be moved through simulated memory
     * during a yield. Though not much is being used here, so it's probably
     * not a big deal.
     *
     * Just to add to this comment... The ndr is derived from simulated
     * memory when the cl-runtime allocates an HsaQueueEntry and populates it
     * for a kernel launch. So in theory the runtime should be able to keep
     * that state around. Then a WF can reference it upon restart to derive
     * kernel wide state. The runtime can deallocate the state when the
     * kernel completes.
     */
    NDRange *ndr;
 };
 // State that needs to be passed between the simulation and simulated app, a
 // pointer to this struct can be passed through the depends field in the
 // HsaQueueEntry struct