diff --git a/configs/example/apu_se.py b/configs/example/apu_se.py
index b8ec149d5..5ec3289d2 100644
--- a/configs/example/apu_se.py
+++ b/configs/example/apu_se.py
@@ -153,7 +153,9 @@
 parser.add_option('--fast-forward-pseudo-op', action='store_true',
                   help = 'fast forward using kvm until the m5_switchcpu'
                   ' pseudo-op is encountered, then switch cpus. subsequent'
                   ' m5_switchcpu pseudo-ops will toggle back and forth')
-
+parser.add_option('--outOfOrderDataDelivery', action='store_true',
+                  default=False, help='enable OoO data delivery in the GM'
+                  ' pipeline')
 
 Ruby.define_options(parser)
@@ -248,7 +250,9 @@ for i in xrange(n_cu):
                              localDataStore = \
                              LdsState(banks = options.numLdsBanks,
                                       bankConflictPenalty = \
-                                      options.ldsBankConflictPenalty)))
+                                      options.ldsBankConflictPenalty),
+                             out_of_order_data_delivery =
+                             options.outOfOrderDataDelivery))
     wavefronts = []
     vrfs = []
     for j in xrange(options.simds_per_cu):
diff --git a/src/arch/hsail/insts/decl.hh b/src/arch/hsail/insts/decl.hh
index c40411ace..4c0bc9ce1 100644
--- a/src/arch/hsail/insts/decl.hh
+++ b/src/arch/hsail/insts/decl.hh
@@ -1082,7 +1082,7 @@ namespace HsailISA
                 gpuDynInst->useContinuation = false;
                 GlobalMemPipeline* gmp = &(w->computeUnit->globalMemoryPipe);
-                gmp->getGMReqFIFO().push(gpuDynInst);
+                gmp->issueRequest(gpuDynInst);
 
                 w->wrGmReqsInPipe--;
                 w->rdGmReqsInPipe--;
diff --git a/src/arch/hsail/insts/mem_impl.hh b/src/arch/hsail/insts/mem_impl.hh
index c175f2782..dbda6643b 100644
--- a/src/arch/hsail/insts/mem_impl.hh
+++ b/src/arch/hsail/insts/mem_impl.hh
@@ -263,7 +263,7 @@ namespace HsailISA
                     }
                 }
 
-                w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+                w->computeUnit->globalMemoryPipe.issueRequest(m);
                 w->outstandingReqsRdGm++;
                 w->rdGmReqsInPipe--;
                 break;
@@ -288,7 +288,7 @@ namespace HsailISA
                     }
                 }
 
-                w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+                w->computeUnit->globalMemoryPipe.issueRequest(m);
                 w->outstandingReqsRdGm++;
                 w->rdGmReqsInPipe--;
                 break;
@@ -312,7 +312,7 @@ namespace HsailISA
                     }
                 }
 
-                w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+                w->computeUnit->globalMemoryPipe.issueRequest(m);
                 w->outstandingReqsRdGm++;
                 w->rdGmReqsInPipe--;
                 break;
@@ -330,7 +330,7 @@ namespace HsailISA
                         }
                     }
                 }
-                w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+                w->computeUnit->globalMemoryPipe.issueRequest(m);
                 w->outstandingReqsRdGm++;
                 w->rdGmReqsInPipe--;
                 break;
@@ -440,7 +440,7 @@ namespace HsailISA
                     }
                 }
 
-                w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+                w->computeUnit->globalMemoryPipe.issueRequest(m);
                 w->outstandingReqsWrGm++;
                 w->wrGmReqsInPipe--;
                 break;
@@ -460,7 +460,7 @@ namespace HsailISA
                     }
                 }
 
-                w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+                w->computeUnit->globalMemoryPipe.issueRequest(m);
                 w->outstandingReqsWrGm++;
                 w->wrGmReqsInPipe--;
                 break;
@@ -486,7 +486,7 @@ namespace HsailISA
                     }
                 }
 
-                w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+                w->computeUnit->globalMemoryPipe.issueRequest(m);
                 w->outstandingReqsWrGm++;
                 w->wrGmReqsInPipe--;
                 break;
@@ -591,7 +591,7 @@ namespace HsailISA
             m->latency.set(w->computeUnit->shader->ticks(64));
             m->pipeId = GLBMEM_PIPE;
 
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsWrGm++;
             w->wrGmReqsInPipe--;
             w->outstandingReqsRdGm++;
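Every HSAIL memory path above now enters the pipeline through GlobalMemPipeline::issueRequest() instead of pushing into the request FIFO it obtained via getGMReqFIFO(). A minimal standalone sketch of why this narrowing matters (Inst, InstPtr, and Pipeline are made-up stand-ins, not the gem5 classes): once the FIFO is private, the pipeline can attach per-request bookkeeping, such as the ordered-response reservation added later in this patch, without touching any ISA code.

    #include <cstdint>
    #include <memory>
    #include <queue>

    struct Inst { uint64_t seqNum; };
    using InstPtr = std::shared_ptr<Inst>;

    class Pipeline
    {
      public:
        // the single entry point for new memory requests; callers no
        // longer see or depend on the internal buffering
        void issueRequest(InstPtr inst) { issued.push(std::move(inst)); }

      private:
        std::queue<InstPtr> issued; // previously exposed via getGMReqFIFO()
    };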
diff --git a/src/arch/hsail/insts/pseudo_inst.cc b/src/arch/hsail/insts/pseudo_inst.cc
index bfffb7d8f..580328aed 100644
--- a/src/arch/hsail/insts/pseudo_inst.cc
+++ b/src/arch/hsail/insts/pseudo_inst.cc
@@ -648,7 +648,7 @@ namespace HsailISA
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(64));
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsWrGm++;
             w->wrGmReqsInPipe--;
             w->outstandingReqsRdGm++;
@@ -688,7 +688,7 @@ namespace HsailISA
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(64));
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsWrGm++;
             w->wrGmReqsInPipe--;
             w->outstandingReqsRdGm++;
@@ -727,7 +727,7 @@ namespace HsailISA
             m->pipeId = GLBMEM_PIPE;
             m->latency.set(w->computeUnit->shader->ticks(1));
-            w->computeUnit->globalMemoryPipe.getGMReqFIFO().push(m);
+            w->computeUnit->globalMemoryPipe.issueRequest(m);
             w->outstandingReqsRdGm++;
             w->rdGmReqsInPipe--;
             w->outstandingReqs++;
diff --git a/src/gpu-compute/GPU.py b/src/gpu-compute/GPU.py
index b672f616c..0cb9e76a4 100644
--- a/src/gpu-compute/GPU.py
+++ b/src/gpu-compute/GPU.py
@@ -135,6 +135,8 @@ class ComputeUnit(MemObject):
     vector_register_file = VectorParam.VectorRegisterFile("Vector register "\
                                                           "file")
+    out_of_order_data_delivery = Param.Bool(False, "enable OoO data delivery"
+                                            " in the GM pipeline")
 
 class Shader(ClockedObject):
     type = 'Shader'
diff --git a/src/gpu-compute/compute_unit.cc b/src/gpu-compute/compute_unit.cc
index 93cffbe1e..ffa5243d2 100644
--- a/src/gpu-compute/compute_unit.cc
+++ b/src/gpu-compute/compute_unit.cc
@@ -1033,17 +1033,7 @@ ComputeUnit::DataPort::MemRespEvent::process()
             if (gpuDynInst->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                 gpuDynInst->statusVector.clear();
 
-            if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
-                assert(compute_unit->globalMemoryPipe.isGMLdRespFIFOWrRdy());
-
-                compute_unit->globalMemoryPipe.getGMLdRespFIFO()
-                    .push(gpuDynInst);
-            } else {
-                assert(compute_unit->globalMemoryPipe.isGMStRespFIFOWrRdy());
-
-                compute_unit->globalMemoryPipe.getGMStRespFIFO()
-                    .push(gpuDynInst);
-            }
+            compute_unit->globalMemoryPipe.handleResponse(gpuDynInst);
 
             DPRINTF(GPUMem, "CU%d: WF[%d][%d]: packet totally complete\n",
                     compute_unit->cu_id, gpuDynInst->simdId,
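The response side mirrors the issue side: the data port hands every completed packet to handleResponse(), and the pipeline decides where it lands based on the delivery mode. A toy version of that dispatch, using the same hypothetical Inst/InstPtr types as the earlier sketch (RespSide and reserve() are invented names, not gem5 API):

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <memory>
    #include <queue>
    #include <utility>

    struct Inst { uint64_t seqNum; bool isLoad; };
    using InstPtr = std::shared_ptr<Inst>;

    class RespSide
    {
      public:
        explicit RespSide(bool ooo) : outOfOrder(ooo) { }

        // in-order mode only: reserve a slot at issue time, not yet done
        void reserve(const InstPtr &inst)
        { orderedResp.emplace(inst->seqNum, std::make_pair(inst, false)); }

        void handleResponse(const InstPtr &inst)
        {
            if (outOfOrder) {
                // OoO mode: enqueue in arrival order, split by type
                (inst->isLoad ? returnedLoads : returnedStores).push(inst);
            } else {
                // in-order mode: the slot was reserved at issue time,
                // so a response only flips its "done" bit
                auto it = orderedResp.find(inst->seqNum);
                assert(it != orderedResp.end());
                it->second.second = true;
            }
        }

      private:
        bool outOfOrder;
        std::queue<InstPtr> returnedLoads, returnedStores;
        std::map<uint64_t, std::pair<InstPtr, bool>> orderedResp;
    };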
diff --git a/src/gpu-compute/global_memory_pipeline.cc b/src/gpu-compute/global_memory_pipeline.cc
index f48af5a6f..7583ebb9b 100644
--- a/src/gpu-compute/global_memory_pipeline.cc
+++ b/src/gpu-compute/global_memory_pipeline.cc
@@ -45,7 +45,8 @@
 GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams* p) :
     computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
-    inflightStores(0), inflightLoads(0)
+    outOfOrderDataDelivery(p->out_of_order_data_delivery), inflightStores(0),
+    inflightLoads(0)
 {
 }
 
@@ -61,8 +62,7 @@ void
 GlobalMemPipeline::exec()
 {
     // apply any returned global memory operations
-    GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
-        !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
+    GPUDynInstPtr m = getNextReadyResp();
 
     bool accessVrf = true;
     Wavefront *w = nullptr;
@@ -74,30 +74,19 @@ GlobalMemPipeline::exec()
 
         accessVrf =
             w->computeUnit->vrf[w->simdId]->
-            vrfOperandAccessReady(m->seqNum(), w, m,
-                                  VrfAccessType::WRITE);
+            vrfOperandAccessReady(m->seqNum(), w, m, VrfAccessType::WRITE);
     }
 
-    if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
-        m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
+    if (m && m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
         accessVrf && m->statusBitVector == VectorMask(0) &&
         (computeUnit->shader->coissue_return ||
-         computeUnit->wfWait.at(m->pipeId).rdy())) {
+        computeUnit->wfWait.at(m->pipeId).rdy())) {
         w = m->wavefront();
 
         m->completeAcc(m);
 
-        if (m->isLoad() || m->isAtomic()) {
-            gmReturnedLoads.pop();
-            assert(inflightLoads > 0);
-            --inflightLoads;
-        } else {
-            assert(m->isStore());
-            gmReturnedStores.pop();
-            assert(inflightStores > 0);
-            --inflightStores;
-        }
+        completeRequest(m);
 
         // Decrement outstanding register count
         computeUnit->shader->ScheduleAdd(&w->outstandingReqs, m->time, -1);
@@ -129,15 +118,30 @@ GlobalMemPipeline::exec()
             } else {
                 ++inflightLoads;
             }
-        } else {
+        } else if (mp->isStore()) {
             if (inflightStores >= gmQueueSize) {
                 return;
-            } else if (mp->isStore()) {
+            } else {
                 ++inflightStores;
             }
         }
 
         mp->initiateAcc(mp);
+
+        if (!outOfOrderDataDelivery && !mp->isMemFence()) {
+            /**
+             * if we are not in out-of-order data delivery mode
+             * then we keep the responses sorted in program order.
+             * in order to do so we must reserve an entry in the
+             * resp buffer before we issue the request to the mem
+             * system. mem fence requests will not be stored here
+             * because once they are issued from the GM pipeline,
+             * they do not send any response back to it.
+             */
+            gmOrderedRespBuffer.insert(std::make_pair(mp->seqNum(),
+                std::make_pair(mp, false)));
+        }
+
         gmIssuedRequests.pop();
 
         DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = \n",
@@ -145,6 +149,86 @@
     }
 }
 
+GPUDynInstPtr
+GlobalMemPipeline::getNextReadyResp()
+{
+    if (outOfOrderDataDelivery) {
+        if (!gmReturnedLoads.empty()) {
+            return gmReturnedLoads.front();
+        } else if (!gmReturnedStores.empty()) {
+            return gmReturnedStores.front();
+        }
+    } else {
+        if (!gmOrderedRespBuffer.empty()) {
+            auto mem_req = gmOrderedRespBuffer.begin();
+
+            if (mem_req->second.second) {
+                return mem_req->second.first;
+            }
+        }
+    }
+
+    return nullptr;
+}
+
+void
+GlobalMemPipeline::completeRequest(GPUDynInstPtr gpuDynInst)
+{
+    if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+        assert(inflightLoads > 0);
+        --inflightLoads;
+    } else if (gpuDynInst->isStore()) {
+        assert(inflightStores > 0);
+        --inflightStores;
+    }
+
+    if (outOfOrderDataDelivery) {
+        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+            assert(!gmReturnedLoads.empty());
+            gmReturnedLoads.pop();
+        } else if (gpuDynInst->isStore()) {
+            assert(!gmReturnedStores.empty());
+            gmReturnedStores.pop();
+        }
+    } else {
+        // we should only pop the oldest request, and it
+        // should be marked as done if we are here
+        assert(gmOrderedRespBuffer.begin()->first == gpuDynInst->seqNum());
+        assert(gmOrderedRespBuffer.begin()->second.first == gpuDynInst);
+        assert(gmOrderedRespBuffer.begin()->second.second);
+        // remove this instruction from the buffer by its
+        // unique seq ID
+        gmOrderedRespBuffer.erase(gpuDynInst->seqNum());
+    }
+}
+
+void
+GlobalMemPipeline::issueRequest(GPUDynInstPtr gpuDynInst)
+{
+    gmIssuedRequests.push(gpuDynInst);
+}
+
+void
+GlobalMemPipeline::handleResponse(GPUDynInstPtr gpuDynInst)
+{
+    if (outOfOrderDataDelivery) {
+        if (gpuDynInst->isLoad() || gpuDynInst->isAtomic()) {
+            assert(isGMLdRespFIFOWrRdy());
+            gmReturnedLoads.push(gpuDynInst);
+        } else {
+            assert(isGMStRespFIFOWrRdy());
+            gmReturnedStores.push(gpuDynInst);
+        }
+    } else {
+        auto mem_req = gmOrderedRespBuffer.find(gpuDynInst->seqNum());
+        // if we are getting a response for this mem request,
+        // then it ought to already be in the ordered response
+        // buffer
+        assert(mem_req != gmOrderedRespBuffer.end());
+        mem_req->second.second = true;
+    }
+}
+
 void
 GlobalMemPipeline::regStats()
 {
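The comment in exec() above is the crux of the in-order mode: an entry is reserved in gmOrderedRespBuffer at issue time, handleResponse() merely flips its "done" bit, and getNextReadyResp()/completeRequest() drain strictly from the oldest sequence number. A self-contained toy run (a bare std::map stands in for the buffer; the instruction labels and sequence numbers are made up) showing how out-of-order returns still complete in program order:

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <utility>

    int main()
    {
        // seqNum -> (instruction label, done?), reserved in program order
        std::map<uint64_t, std::pair<const char*, bool>> buf;
        buf.emplace(1, std::make_pair("ld A", false));
        buf.emplace(2, std::make_pair("st B", false));
        buf.emplace(3, std::make_pair("ld C", false));

        // memory responses arrive out of order: 3, then 1, then 2
        for (uint64_t seq : {3, 1, 2}) {
            buf.at(seq).second = true;               // handleResponse()
            // drain only while the oldest outstanding entry is done
            while (!buf.empty() && buf.begin()->second.second) {
                std::cout << buf.begin()->second.first << " completes\n";
                buf.erase(buf.begin());              // completeRequest()
            }
        }
        // prints ld A, st B, ld C -- program order, although 3 returned first
    }

std::map keeps its keys sorted, so begin() is always the oldest outstanding seqNum; that single property is the whole ordering mechanism.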
diff --git a/src/gpu-compute/global_memory_pipeline.hh b/src/gpu-compute/global_memory_pipeline.hh
index 368a15079..d10b7c1a2 100644
--- a/src/gpu-compute/global_memory_pipeline.hh
+++ b/src/gpu-compute/global_memory_pipeline.hh
@@ -62,10 +62,40 @@ class GlobalMemPipeline
     void init(ComputeUnit *cu);
     void exec();
 
-    std::queue<GPUDynInstPtr> &getGMReqFIFO() { return gmIssuedRequests; }
     std::queue<GPUDynInstPtr> &getGMStRespFIFO() { return gmReturnedStores; }
     std::queue<GPUDynInstPtr> &getGMLdRespFIFO() { return gmReturnedLoads; }
 
+    /**
+     * find the next ready response to service. for OoO mode we
+     * simply pop the oldest (based on when the response was
+     * received) response in the response FIFOs. for in-order mode
+     * we pop the oldest (in program order) response, and only if
+     * it is marked as done.
+     */
+    GPUDynInstPtr getNextReadyResp();
+
+    /**
+     * once a memory request is finished we remove it from the
+     * buffer. this method determines which response buffer
+     * we're using based on the mode (in-order vs. OoO).
+     */
+    void completeRequest(GPUDynInstPtr gpuDynInst);
+
+    /**
+     * issues a request to the pipeline - i.e., enqueue it
+     * in the request buffer.
+     */
+    void issueRequest(GPUDynInstPtr gpuDynInst);
+
+    /**
+     * this method handles responses sent to this GM pipeline by the
+     * CU. in the case of in-order delivery it simply marks the request
+     * as done in the ordered buffer to indicate that the request is
+     * finished. for out-of-order data delivery, the requests are
+     * enqueued (in the order in which they are received) in the
+     * response FIFOs.
+     */
+    void handleResponse(GPUDynInstPtr gpuDynInst);
+
     bool
     isGMLdRespFIFOWrRdy() const
     {
@@ -97,6 +127,7 @@ class GlobalMemPipeline
     ComputeUnit *computeUnit;
     std::string _name;
     int gmQueueSize;
+    bool outOfOrderDataDelivery;
 
     // number of cycles of delaying the update of a VGPR that is the
     // target of a load instruction (or the load component of an atomic)
@@ -111,6 +142,22 @@ class GlobalMemPipeline
     // The size of global memory.
     int globalMemSize;
 
+    /*
+     * this buffer holds the memory responses when in-order data
+     * delivery is used - the responses are ordered by their unique
+     * sequence number, which is monotonically increasing. when a
+     * memory request returns, its "done" flag is set to true. during
+     * each tick the GM pipeline will check if the oldest request
+     * is finished, and if so it will be removed from the queue.
+     *
+     * key:   memory instruction's sequence ID
+     *
+     * value: pair holding the instruction pointer and a bool that
+     *        is used to indicate whether or not the request has
+     *        completed
+     */
+    std::map<uint64_t, std::pair<GPUDynInstPtr, bool>> gmOrderedRespBuffer;
+
     // Global Memory Request FIFO: all global memory requests
     // are issued to this FIFO from the memory pipelines
     std::queue<GPUDynInstPtr> gmIssuedRequests;
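For reference, the selection policy documented on getNextReadyResp() reduces to the following free-function sketch (toy Inst/InstPtr types as in the earlier sketches; nextReadyResp is an invented name): in OoO mode the load FIFO is serviced ahead of the store FIFO, each in arrival order, while in in-order mode only the oldest entry is eligible, and only once it has been marked done.

    #include <cstdint>
    #include <map>
    #include <memory>
    #include <queue>
    #include <utility>

    struct Inst { uint64_t seqNum; };
    using InstPtr = std::shared_ptr<Inst>;

    InstPtr
    nextReadyResp(bool outOfOrder,
                  std::queue<InstPtr> &returnedLoads,
                  std::queue<InstPtr> &returnedStores,
                  std::map<uint64_t, std::pair<InstPtr, bool>> &orderedResp)
    {
        if (outOfOrder) {
            // arrival order, loads before stores
            if (!returnedLoads.empty())
                return returnedLoads.front();
            if (!returnedStores.empty())
                return returnedStores.front();
        } else if (!orderedResp.empty() && orderedResp.begin()->second.second) {
            // oldest in program order, and only once it is done
            return orderedResp.begin()->second.first;
        }
        return nullptr;
    }

Which of the two policies runs is chosen by the new --outOfOrderDataDelivery option, plumbed through GPU.py's out_of_order_data_delivery param; the default (False) keeps the in-order behavior.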