hsail, gpu-compute: remove doGm/SmReturn add completeAcc

We are removing doGmReturn from the GM pipe and adding completeAcc() implementations for the HSAIL mem ops. The behavior in doGmReturn depends on HSAIL and the HSAIL mem ops; however, the completion phase of memory ops in a machine ISA can be very different, even among individual machine ISA mem ops. We therefore remove this functionality from the pipeline and allow it to be implemented by the individual instructions.
2016-10-26 22:47:19 -04:00 · 2016-10-26 22:47:19 -04:00 · 00a6346c91
commit 00a6346c91
parent 7ac38849ab
7 changed files with 225 additions and 220 deletions
--- a/src/arch/hsail/insts/mem.hh
+++ b/src/arch/hsail/insts/mem.hh
@ -36,9 +36,12 @@
 #ifndef __ARCH_HSAIL_INSTS_MEM_HH__
 #define __ARCH_HSAIL_INSTS_MEM_HH__
 #include <type_traits>
 #include "arch/hsail/insts/decl.hh"
 #include "arch/hsail/insts/gpu_static_inst.hh"
 #include "arch/hsail/operand.hh"
 #include "gpu-compute/compute_unit.hh"
 namespace HsailISA
 {
@ -491,6 +494,86 @@ namespace HsailISA
            gpuDynInst->updateStats();
        }
        /**
         * Finish a load whose data has returned: for every destination
         * register operand, copy each active lane's value out of
         * gpuDynInst->d_data into the mapped physical VGPR, then model
         * the VRF write timing and charge the resulting bank conflict
         * cycles to the global or local memory pipeline.
         *
         * @param gpuDynInst the dynamic instruction being completed
         */
        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // type of each element as it sits in d_data
            using MemCType = typename MemDataType::CType;

            /**
             * the aliases below essentially replace the long if-else
             * chain that was used in GlobalMemPipeline::exec() to infer
             * the size (single/double) and type (floating point/integer)
             * of the destination register. this is needed for load
             * instructions because the loaded value and the destination
             * type can be of different sizes, and we also need to know
             * if the value we're writing back is floating point and
             * signed/unsigned, so we can properly cast the writeback
             * value
             */
            using Reg32CType = typename std::conditional<
                std::is_floating_point<MemCType>::value, float,
                typename std::conditional<
                    std::is_signed<MemCType>::value,
                    int32_t, uint32_t>::type>::type;
            using Reg64CType = typename std::conditional<
                std::is_floating_point<MemCType>::value, double,
                typename std::conditional<
                    std::is_signed<MemCType>::value,
                    int64_t, uint64_t>::type>::type;
            // pick the 32-bit or 64-bit flavour from the VGPR type
            using RegCType = typename std::conditional<
                DestDataType::vgprType == VT_32,
                Reg32CType, Reg64CType>::type;

            Wavefront *wf = gpuDynInst->wavefront();

            // physical VGPR indices touched by this load
            std::vector<uint32_t> physRegs;

            // one pass per destination register operand, since this
            // is a load
            for (int opIdx = 0; opIdx < num_dest_operands; ++opIdx) {
                assert((sizeof(MemCType) * num_dest_operands)
                       <= MAX_WIDTH_FOR_MEM_INST);

                // by default the registers follow on from dest; past
                // MAX_REGS_FOR_NON_VEC_MEM_INST operands each register
                // is looked up in dest_vect instead
                int virtReg = this->dest.regIndex() + opIdx;
                if (num_dest_operands > MAX_REGS_FOR_NON_VEC_MEM_INST) {
                    virtReg = dest_vect[opIdx].regIndex();
                }

                // virtual->physical VGPR mapping, remembered for the
                // timing model below
                const int physReg = wf->remap(virtReg, sizeof(RegCType), 1);
                physRegs.push_back(physReg);

                // start of this operand's per-lane values in d_data
                MemCType *laneVals = ((MemCType*)gpuDynInst->d_data)
                    + opIdx * wf->computeUnit->wfSize();

                for (int lane = 0; lane < wf->computeUnit->wfSize(); ++lane) {
                    // lanes masked off in exec_mask are left untouched
                    if (!gpuDynInst->exec_mask[lane]) {
                        continue;
                    }

                    // NOTE(review): the trace text says "global ld" even
                    // though this method also completes local memory
                    // loads (see the isLocalMem() branch below)
                    DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                            "$%s%d <- %d global ld done (src = wavefront "
                            "ld inst)\n", wf->computeUnit->cu_id,
                            wf->simdId, wf->wfSlotId, lane,
                            sizeof(RegCType) == 4 ? "s" : "d",
                            virtReg, laneVals[lane]);

                    // write the value into the physical VGPR. This is a
                    // purely functional operation. No timing is modeled.
                    wf->computeUnit->vrf[wf->simdId]->write<RegCType>(
                        physReg, laneVals[lane], lane);
                }
            }

            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write
            // operation. It does not modify the physical VGPR.
            const int bankConflictCycles = gpuDynInst->computeUnit()->
                vrf[wf->simdId]->exec(gpuDynInst->seqNum(), wf, physRegs,
                                      sizeof(RegCType), gpuDynInst->time);

            // account the conflict cycles to whichever pipeline
            // serviced the load
            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(bankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(bankConflictCycles);
            }
        }
      private:
        void
        execLdAcq(GPUDynInstPtr gpuDynInst) override
@ -941,6 +1024,11 @@ namespace HsailISA
            execSt(gpuDynInst);
        }
        // Stores don't write anything back, so there is nothing
        // to do on completion. This method is overridden only to
        // avoid the fatal in the base class implementation.
        void completeAcc(GPUDynInstPtr gpuDynInst) override { }
      private:
        // execSt may be called through a continuation
        // if the store had release semantics. see comment for
@ -1409,6 +1497,58 @@ namespace HsailISA
        }
        /**
         * Completion step for an atomic. Atomics that do not return a
         * value have nothing to write back. For a returning atomic,
         * each active lane's value is copied from gpuDynInst->d_data
         * into the destination VGPR, after which the VRF write timing
         * is modeled and the bank conflict cycles are charged to the
         * global or local memory pipeline.
         *
         * @param gpuDynInst the dynamic instruction being completed
         */
        void
        completeAcc(GPUDynInstPtr gpuDynInst) override
        {
            // if this is not an atomic return op, then we
            // have nothing more to do.
            if (!this->isAtomicRet()) {
                return;
            }

            // the size of the src operands and the memory being
            // operated on must match for HSAIL atomics, so a single
            // type covers both the returned data and the register
            // writeback - this assumption may not apply to all ISAs
            using RetCType = typename MemDataType::CType;

            Wavefront *wf = gpuDynInst->wavefront();
            const int virtReg = this->dest.regIndex();

            // virtual->physical VGPR mapping; an atomic has a single
            // destination register
            const int physReg = wf->remap(virtReg, sizeof(RetCType), 1);
            std::vector<uint32_t> physRegs;
            physRegs.push_back(physReg);

            // per-lane return values start at the front of d_data
            RetCType *laneVals = (RetCType*)gpuDynInst->d_data;

            for (int lane = 0; lane < wf->computeUnit->wfSize(); ++lane) {
                // lanes masked off in exec_mask are left untouched
                if (!gpuDynInst->exec_mask[lane]) {
                    continue;
                }

                // NOTE(review): the trace text reads "global ld ... ld
                // inst" although this is the atomic completion path
                DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                        "$%s%d <- %d global ld done (src = wavefront "
                        "ld inst)\n", wf->computeUnit->cu_id, wf->simdId,
                        wf->wfSlotId, lane,
                        sizeof(RetCType) == 4 ? "s" : "d",
                        virtReg, laneVals[lane]);

                // write the value into the physical VGPR. This is a
                // purely functional operation. No timing is modeled.
                wf->computeUnit->vrf[wf->simdId]->write<RetCType>(
                    physReg, laneVals[lane], lane);
            }

            // Schedule the write operation of the returned data on the
            // VRF. This simply models the timing aspect of the VRF write
            // operation. It does not modify the physical VGPR.
            const int bankConflictCycles = gpuDynInst->computeUnit()->
                vrf[wf->simdId]->exec(gpuDynInst->seqNum(), wf, physRegs,
                                      sizeof(RetCType), gpuDynInst->time);

            // account the conflict cycles to whichever pipeline
            // serviced the atomic
            if (this->isGlobalMem()) {
                gpuDynInst->computeUnit()->globalMemoryPipe
                    .incLoadVRFBankConflictCycles(bankConflictCycles);
            } else {
                assert(this->isLocalMem());
                gpuDynInst->computeUnit()->localMemoryPipe
                    .incLoadVRFBankConflictCycles(bankConflictCycles);
            }
        }
        void execute(GPUDynInstPtr gpuDynInst) override;
      private:
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@ -65,13 +65,15 @@ GlobalMemPipeline::exec()
        !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
    bool accessVrf = true;
    Wavefront *w = nullptr;
    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if ((m) && (m->isLoad() || m->isAtomicRet())) {
-        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+        w = m->wavefront();
        accessVrf =
-            w->computeUnit->vrf[m->simdId]->
+            w->computeUnit->vrf[w->simdId]->
            vrfOperandAccessReady(m->seqNum(), w, m,
                                  VrfAccessType::WRITE);
    }
@ -82,44 +84,38 @@ GlobalMemPipeline::exec()
        (computeUnit->shader->coissue_return ||
         computeUnit->wfWait.at(m->pipeId).rdy())) {
-        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
+        w = m->wavefront();
-            doGmReturn<uint32_t, uint8_t>(m);
+
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
+        m->completeAcc(m);
-            doGmReturn<uint32_t, uint16_t>(m);
+
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
+        if (m->isLoad() || m->isAtomic()) {
-            doGmReturn<uint32_t, uint32_t>(m);
+            gmReturnedLoads.pop();
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
+            assert(inflightLoads > 0);
-            doGmReturn<int32_t, int8_t>(m);
+            --inflightLoads;
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
+        } else {
-            doGmReturn<int32_t, int16_t>(m);
+            assert(m->isStore());
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
+            gmReturnedStores.pop();
-            doGmReturn<int32_t, int32_t>(m);
+            assert(inflightStores > 0);
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
+            --inflightStores;
-            doGmReturn<float, Float16>(m);
+        }
-        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
+
-            doGmReturn<float, float>(m);
+        // Decrement outstanding register count
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
+        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
-            doGmReturn<uint64_t, uint8_t>(m);
+
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
+        if (m->isStore() || m->isAtomic()) {
-            doGmReturn<uint64_t, uint16_t>(m);
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm,
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
+                                             m->time, -1);
-            doGmReturn<uint64_t, uint32_t>(m);
+        }
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
+
-            doGmReturn<uint64_t, uint64_t>(m);
+        if (m->isLoad() || m->isAtomic()) {
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
+            computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm,
-            doGmReturn<int64_t, int8_t>(m);
+                                             m->time, -1);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
+        }
-            doGmReturn<int64_t, int16_t>(m);
+
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
+        // Mark write bus busy for appropriate amount of time
-            doGmReturn<int64_t, int32_t>(m);
+        computeUnit->glbMemToVrfBus.set(m->time);
-        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
+        if (!computeUnit->shader->coissue_return)
-            doGmReturn<int64_t, int64_t>(m);
+            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
            doGmReturn<double, Float16>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
            doGmReturn<double, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
            doGmReturn<double, double>(m);
    }
    // If pipeline has executed a global memory instruction
@ -149,83 +145,6 @@ GlobalMemPipeline::exec()
    }
 }
 /**
  * Retire one returned global memory instruction.
  *
  * Loads and atomics are popped from gmReturnedLoads, anything else
  * from gmReturnedStores, and the matching in-flight counter is
  * decremented. For loads and atomics that return a value, the
  * per-lane data in m->d_data is written to the destination VGPR(s)
  * and the VRF write is scheduled. Finally the wavefront's outstanding
  * request counters are decremented and the global-memory-to-VRF bus
  * (plus, without coissue_return, the wfWait entry for m->pipeId) is
  * set busy using m->time.
  *
  * @tparam c0 C type used when writing the value into the VGPR
  * @tparam c1 C type of the elements held in m->d_data
  * @param m the returned memory instruction
  */
 template<typename c0, typename c1>
 void
 GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
 {
    // wavefront that issued the instruction
    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
    // Return data to registers
    if (m->isLoad() || m->isAtomic()) {
        // all atomics, returning or not, travel through the load FIFO
        gmReturnedLoads.pop();
        assert(inflightLoads > 0);
        --inflightLoads;
        // only loads and returning atomics have data to write back
        if (m->isLoad() || m->isAtomicRet()) {
            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load or atomic operation
            for (int k = 0; k < m->n_reg; ++k) {
                assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
                // registers follow on from dst_reg unless the operand
                // count exceeds MAX_REGS_FOR_NON_VEC_MEM_INST, in which
                // case each one is listed in dst_reg_vec
                int dst = m->dst_reg + k;
                if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = m->dst_reg_vec[k];
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);
                // operand k's lane values start k * wfSize() elements in
                c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
                for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                    // lanes masked off in exec_mask are skipped
                    if (m->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                    *p1, i);
                    }
                    // advance to the next lane's value whether or not
                    // this lane was active
                    ++p1;
                }
            }
            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write operation.
            // It does not modify the physical VGPR.
            loadVrfBankConflictCycles +=
                w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
                                                     w, regVec, sizeof(c0),
                                                     m->time);
        }
    } else {
        // neither a load nor an atomic: retire from the store FIFO
        gmReturnedStores.pop();
        assert(inflightStores > 0);
        --inflightStores;
    }
    // Decrement outstanding register count
    computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
    // atomics count against both the write and the read counters
    if (m->isStore() || m->isAtomic()) {
        computeUnit->shader->ScheduleAdd(&w->outstandingReqsWrGm, m->time,
                                         -1);
    }
    if (m->isLoad() || m->isAtomic()) {
        computeUnit->shader->ScheduleAdd(&w->outstandingReqsRdGm, m->time,
                                         -1);
    }
    // Mark write bus busy for appropriate amount of time
    computeUnit->glbMemToVrfBus.set(m->time);
    if (!computeUnit->shader->coissue_return)
        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
 }
 void
 GlobalMemPipeline::regStats()
 {
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@ -62,8 +62,6 @@ class GlobalMemPipeline
    void init(ComputeUnit *cu);
    void exec();
    template<typename c0, typename c1> void doGmReturn(GPUDynInstPtr m);
    // Accessors handing out mutable references to the pipeline's
    // internal FIFOs: issued requests, returned stores and returned
    // loads, in that order.
    std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
    std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
    std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
@ -89,6 +87,12 @@ class GlobalMemPipeline
    // Returns the name string held in _name.
    const std::string &name() const { return _name; }
    void regStats();
    /**
     * Add to this pipeline's loadVrfBankConflictCycles counter. The
     * counter is updated through this public method so that code
     * outside the pipeline (the instructions' completeAcc()
     * implementations) can report the cycles returned by the VRF
     * timing model.
     *
     * @param num_cycles number of cycles to add to the counter
     */
    void
    incLoadVRFBankConflictCycles(int num_cycles)
    {
        loadVrfBankConflictCycles += num_cycles;
    }
  private:
    ComputeUnit *computeUnit;
    std::string _name;
--- a/src/gpu-compute/gpu_dyn_inst.cc
+++ b/src/gpu-compute/gpu_dyn_inst.cc
@ -155,6 +155,12 @@ GPUDynInst::initiateAcc(GPUDynInstPtr gpuDynInst)
    time = 0;
 }
 /**
  * Complete a memory access by forwarding to the completeAcc()
  * implementation of the underlying static instruction.
  *
  * @param gpuDynInst the dynamic instruction handed on to the static
  *                   instruction
  */
 void
 GPUDynInst::completeAcc(GPUDynInstPtr gpuDynInst)
 {
    _staticInst->completeAcc(gpuDynInst);
 }
 /**
 * accessor methods for the attributes of
 * the underlying GPU static instruction
--- a/src/gpu-compute/gpu_dyn_inst.hh
+++ b/src/gpu-compute/gpu_dyn_inst.hh
@ -258,6 +258,10 @@ class GPUDynInst : public GPUExecContext
    // Initiate the specified memory operation, by creating a
    // memory request and sending it off to the memory system.
    void initiateAcc(GPUDynInstPtr gpuDynInst);
    // Complete the specified memory operation: for a load or an
    // atomic with return, write the value back to the RF; for a
    // store, there is nothing to do.
    void completeAcc(GPUDynInstPtr gpuDynInst);
    void updateStats();
--- a/src/gpu-compute/local_memory_pipeline.cc
+++ b/src/gpu-compute/local_memory_pipeline.cc
@ -62,11 +62,13 @@ LocalMemPipeline::exec()
        lmReturnedRequests.front() : nullptr;
    bool accessVrf = true;
    Wavefront *w = nullptr;
    if ((m) && (m->isLoad() || m->isAtomicRet())) {
-        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+        w = m->wavefront();
        accessVrf =
-            w->computeUnit->vrf[m->simdId]->
+            w->computeUnit->vrf[w->simdId]->
            vrfOperandAccessReady(m->seqNum(), w, m,
                                  VrfAccessType::WRITE);
    }
@ -74,100 +76,11 @@ LocalMemPipeline::exec()
    if (!lmReturnedRequests.empty() && m->latency.rdy() && accessVrf &&
        computeUnit->locMemToVrfBus.rdy() && (computeUnit->shader->coissue_return
                 || computeUnit->wfWait.at(m->pipeId).rdy())) {
        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
            doSmReturn<uint32_t, uint8_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
            doSmReturn<uint32_t, uint16_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
            doSmReturn<uint32_t, uint32_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
            doSmReturn<int32_t, int8_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
            doSmReturn<int32_t, int16_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
            doSmReturn<int32_t, int32_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
            doSmReturn<float, Float16>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
            doSmReturn<float, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
            doSmReturn<uint64_t, uint8_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
            doSmReturn<uint64_t, uint16_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
            doSmReturn<uint64_t, uint32_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
            doSmReturn<uint64_t, uint64_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
            doSmReturn<int64_t, int8_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
            doSmReturn<int64_t, int16_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
            doSmReturn<int64_t, int32_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
            doSmReturn<int64_t, int64_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
            doSmReturn<double, Float16>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
            doSmReturn<double, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
            doSmReturn<double, double>(m);
    }
    // If pipeline has executed a local memory instruction
    // execute local memory packet and issue the packets
    // to LDS
    if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
        GPUDynInstPtr m = lmIssuedRequests.front();
        bool returnVal = computeUnit->sendToLds(m);
        if (!returnVal) {
            DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
        }
        lmIssuedRequests.pop();
    }
 }
 template<typename c0, typename c1>
 void
 LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
 {
        lmReturnedRequests.pop();
-    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];
+        w = m->wavefront();
-    // Return data to registers
+        m->completeAcc(m);
    if (m->isLoad() || m->isAtomicRet()) {
        std::vector<uint32_t> regVec;
        for (int k = 0; k < m->n_reg; ++k) {
            int dst = m->dst_reg+k;
            if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                dst = m->dst_reg_vec[k];
            // virtual->physical VGPR mapping
            int physVgpr = w->remap(dst,sizeof(c0),1);
            // save the physical VGPR index
            regVec.push_back(physVgpr);
            c1 *p1 = &((c1 *)m->d_data)[k * w->computeUnit->wfSize()];
            for (int i = 0; i < w->computeUnit->wfSize(); ++i) {
                if (m->exec_mask[i]) {
                    // write the value into the physical VGPR. This is a purely
                    // functional operation. No timing is modeled.
                    w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                *p1, i);
                }
                ++p1;
            }
        }
        // Schedule the write operation of the load data on the VRF. This simply
        // models the timing aspect of the VRF write operation. It does not
        // modify the physical VGPR.
        loadVrfBankConflictCycles +=
            w->computeUnit->vrf[w->simdId]->exec(m->seqNum(), w,
                                                 regVec, sizeof(c0), m->time);
    }
        // Decrement outstanding request count
        computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
@ -188,6 +101,21 @@ LocalMemPipeline::doSmReturn(GPUDynInstPtr m)
            w->computeUnit->wfWait.at(m->pipeId).set(m->time);
    }
    // If pipeline has executed a local memory instruction
    // execute local memory packet and issue the packets
    // to LDS
    if (!lmIssuedRequests.empty() && lmReturnedRequests.size() < lmQueueSize) {
        GPUDynInstPtr m = lmIssuedRequests.front();
        bool returnVal = computeUnit->sendToLds(m);
        if (!returnVal) {
            DPRINTF(GPUPort, "packet was nack'd and put in retry queue");
        }
        lmIssuedRequests.pop();
    }
 }
 void
 LocalMemPipeline::regStats()
 {
--- a/src/gpu-compute/local_memory_pipeline.hh
+++ b/src/gpu-compute/local_memory_pipeline.hh
@ -61,8 +61,6 @@ class LocalMemPipeline
    void init(ComputeUnit *cu);
    void exec();
    template<typename c0, typename c1> void doSmReturn(GPUDynInstPtr m);
    // Accessors handing out mutable references to the pipeline's
    // internal FIFOs: issued requests and returned requests.
    std::queue<GPUDynInstPtr> &getLMReqFIFO() { return lmIssuedRequests; }
    std::queue<GPUDynInstPtr> &getLMRespFIFO() { return lmReturnedRequests; }
@ -81,6 +79,12 @@ class LocalMemPipeline
    // Returns the name string held in _name.
    const std::string& name() const { return _name; }
    void regStats();
    /**
     * Add to this pipeline's loadVrfBankConflictCycles counter. The
     * counter is updated through this public method so that code
     * outside the pipeline can report bank conflict cycles to it.
     *
     * @param num_cycles number of cycles to add to the counter
     */
    void
    incLoadVRFBankConflictCycles(int num_cycles)
    {
        loadVrfBankConflictCycles += num_cycles;
    }
  private:
    ComputeUnit *computeUnit;
    std::string _name;