diff --git a/cpu/ozone/front_end.hh b/cpu/ozone/front_end.hh index dd382491f..b677e667c 100644 --- a/cpu/ozone/front_end.hh +++ b/cpu/ozone/front_end.hh @@ -31,6 +31,7 @@ #include +#include "base/timebuf.hh" #include "cpu/inst_seq.hh" #include "cpu/o3/bpred_unit.hh" #include "cpu/ozone/rename_table.hh" @@ -210,15 +211,21 @@ class FrontEnd void dumpInsts(); private: + TimeBuffer numInstsReady; + typedef typename std::deque InstBuff; typedef typename InstBuff::iterator InstBuffIt; + InstBuff feBuffer; + InstBuff instBuffer; int instBufferSize; int maxInstBufferSize; + int latency; + int width; int freeRegs; diff --git a/cpu/ozone/front_end_impl.hh b/cpu/ozone/front_end_impl.hh index ca9948b7d..09fc2e2f8 100644 --- a/cpu/ozone/front_end_impl.hh +++ b/cpu/ozone/front_end_impl.hh @@ -41,8 +41,10 @@ template FrontEnd::FrontEnd(Params *params) : branchPred(params), icacheInterface(params->icacheInterface), + numInstsReady(params->frontEndLatency, 0), instBufferSize(0), maxInstBufferSize(params->maxInstBufferSize), + latency(params->frontEndLatency), width(params->frontEndWidth), freeRegs(params->numPhysicalRegs), numPhysRegs(params->numPhysicalRegs), @@ -261,6 +263,18 @@ FrontEnd::tick() if (switchedOut) return; + for (int insts_to_queue = numInstsReady[-latency]; + !instBuffer.empty() && insts_to_queue; + --insts_to_queue) + { + DPRINTF(FE, "Transferring instruction [sn:%lli] to the feBuffer\n", + instBuffer.front()->seqNum); + feBuffer.push_back(instBuffer.front()); + instBuffer.pop_front(); + } + + numInstsReady.advance(); + // @todo: Maybe I want to just have direct communication... 
if (fromCommit->doneSeqNum) { branchPred.update(fromCommit->doneSeqNum, 0); @@ -349,6 +363,7 @@ FrontEnd::tick() // latency instBuffer.push_back(inst); ++instBufferSize; + numInstsReady[0]++; ++num_inst; #if FULL_SYSTEM @@ -570,6 +585,7 @@ FrontEnd::handleFault(Fault &fault) instruction->fault = fault; instruction->setCanIssue(); instBuffer.push_back(instruction); + numInstsReady[0]++; ++instBufferSize; } @@ -599,6 +615,21 @@ FrontEnd::squash(const InstSeqNum &squash_num, const Addr &next_PC, freeRegs+= inst->numDestRegs(); } + while (!feBuffer.empty() && + feBuffer.back()->seqNum > squash_num) { + DynInstPtr inst = feBuffer.back(); + + DPRINTF(FE, "Squashing instruction [sn:%lli] PC %#x\n", + inst->seqNum, inst->readPC()); + + inst->clearDependents(); + + feBuffer.pop_back(); + --instBufferSize; + + freeRegs+= inst->numDestRegs(); + } + // Copy over rename table from the back end. renameTable.copyFrom(backEnd->renameTable); @@ -633,13 +664,13 @@ template typename Impl::DynInstPtr FrontEnd::getInst() { - if (instBufferSize == 0) { + if (feBuffer.empty()) { return NULL; } - DynInstPtr inst = instBuffer.front(); + DynInstPtr inst = feBuffer.front(); - instBuffer.pop_front(); + feBuffer.pop_front(); --instBufferSize; @@ -857,6 +888,7 @@ FrontEnd::doSwitchOut() squash(0, 0); instBuffer.clear(); instBufferSize = 0; + feBuffer.clear(); status = Idle; } diff --git a/cpu/ozone/lw_back_end.hh b/cpu/ozone/lw_back_end.hh index 19f2b2b61..4e2f5606c 100644 --- a/cpu/ozone/lw_back_end.hh +++ b/cpu/ozone/lw_back_end.hh @@ -78,7 +78,7 @@ class LWBackEnd TimeBuffer i2e; typename TimeBuffer::wire instsToExecute; TimeBuffer e2c; - TimeBuffer numInstsToWB; + TimeBuffer numInstsToWB; TimeBuffer *comm; typename TimeBuffer::wire toIEW; @@ -157,7 +157,7 @@ class LWBackEnd Tick lastCommitCycle; - bool robEmpty() { return instList.empty(); } + bool robEmpty() { return numInsts == 0; } bool isFull() { return numInsts >= numROBEntries; } bool isBlocked() { return status == Blocked || 
dispatchStatus == Blocked; } @@ -212,6 +212,7 @@ class LWBackEnd } void instToCommit(DynInstPtr &inst); + void readyInstsForCommit(); void switchOut(); void doSwitchOut(); @@ -293,12 +294,13 @@ class LWBackEnd MemReqPtr memReq; + int latency; + // General back end width. Used if the more specific isn't given. int width; // Dispatch width. int dispatchWidth; - int numDispatchEntries; int dispatchSize; int waitingInsts; @@ -323,6 +325,7 @@ class LWBackEnd int numROBEntries; int numInsts; + bool lsqLimits; std::set waitingMemOps; typedef std::set::iterator MemIt; @@ -333,9 +336,6 @@ class LWBackEnd InstSeqNum squashSeqNum; Addr squashNextPC; - Fault faultFromFetch; - bool fetchHasFault; - bool switchedOut; bool switchPending; @@ -359,8 +359,6 @@ class LWBackEnd std::list replayList; std::list writeback; - int latency; - int squashLatency; bool exactFullStall; @@ -397,9 +395,11 @@ class LWBackEnd Stats::Scalar<> lsqInversion; Stats::Vector<> nIssuedDist; +/* Stats::VectorDistribution<> issueDelayDist; Stats::VectorDistribution<> queueResDist; +*/ /* Stats::Vector<> stat_fu_busy; Stats::Vector2d<> stat_fuBusy; @@ -447,7 +447,7 @@ class LWBackEnd Stats::Vector<> ROBCount; // cumulative ROB occupancy Stats::Formula ROBOccRate; - Stats::VectorDistribution<> ROBOccDist; +// Stats::VectorDistribution<> ROBOccDist; public: void dumpInsts(); diff --git a/cpu/ozone/lw_back_end_impl.hh b/cpu/ozone/lw_back_end_impl.hh index 9e1cd28cf..9a6ad4c14 100644 --- a/cpu/ozone/lw_back_end_impl.hh +++ b/cpu/ozone/lw_back_end_impl.hh @@ -151,8 +151,10 @@ LWBackEnd::LdWritebackEvent::process() // iewStage->wakeCPU(); - if (be->isSwitchedOut()) - return; + assert(inst->isSquashed() || !be->isSwitchedOut()); + +// if (be->isSwitchedOut() && inst->isLoad()) +// return; if (dcacheMiss) { be->removeDcacheMiss(inst); @@ -208,14 +210,14 @@ LWBackEnd::DCacheCompletionEvent::description() template LWBackEnd::LWBackEnd(Params *params) - : d2i(5, 5), i2e(5, 5), e2c(5, 5), numInstsToWB(5, 5), + : d2i(5, 
5), i2e(5, 5), e2c(5, 5), numInstsToWB(params->backEndLatency, 0), trapSquash(false), xcSquash(false), cacheCompletionEvent(this), - dcacheInterface(params->dcacheInterface), width(params->backEndWidth), + dcacheInterface(params->dcacheInterface), latency(params->backEndLatency), + width(params->backEndWidth), lsqLimits(params->lsqLimits), exactFullStall(true) { numROBEntries = params->numROBEntries; numInsts = 0; - numDispatchEntries = 32; maxOutstandingMemOps = params->maxOutstandingMemOps; numWaitingMemOps = 0; waitingInsts = 0; @@ -251,6 +253,8 @@ void LWBackEnd::regStats() { using namespace Stats; + LSQ.regStats(); + robCapEvents .init(cpu->number_of_threads) .name(name() + ".ROB:cap_events") @@ -377,6 +381,7 @@ LWBackEnd::regStats() .desc("Number of insts issued each cycle") .flags(total | pdf | dist) ; +/* issueDelayDist .init(Num_OpClasses,0,99,2) .name(name() + ".ISSUE:") @@ -393,7 +398,7 @@ LWBackEnd::regStats() for (int i = 0; i < Num_OpClasses; ++i) { queueResDist.subname(i, opClassStrings[i]); } - +*/ writebackCount .init(cpu->number_of_threads) .name(name() + ".WB:count") @@ -555,13 +560,14 @@ LWBackEnd::regStats() .flags(total) ; ROBOccRate = ROBCount / cpu->numCycles; - +/* ROBOccDist .init(cpu->number_of_threads,0,numROBEntries,2) .name(name() + ".ROB:occ_dist") .desc("ROB Occupancy per cycle") .flags(total | cdf) ; +*/ } template @@ -654,18 +660,22 @@ LWBackEnd::tick() { DPRINTF(BE, "Ticking back end\n"); + // Read in any done instruction information and update the IQ or LSQ. + updateStructures(); + if (switchPending && robEmpty() && !LSQ.hasStoresToWB()) { cpu->signalSwitched(); return; } + readyInstsForCommit(); + + numInstsToWB.advance(); + ROBCount[0]+= numInsts; wbCycle = 0; - // Read in any done instruction information and update the IQ or LSQ. 
- updateStructures(); - #if FULL_SYSTEM checkInterrupts(); @@ -740,6 +750,10 @@ LWBackEnd::dispatchInsts() while (numInsts < numROBEntries && numWaitingMemOps < maxOutstandingMemOps) { // Get instruction from front of time buffer + if (lsqLimits && LSQ.isFull()) { + break; + } + DynInstPtr inst = frontEnd->getInst(); if (!inst) { break; @@ -798,6 +812,7 @@ LWBackEnd::dispatchInsts() inst->setIssued(); inst->setExecuted(); inst->setCanCommit(); + numInstsToWB[0]++; } else { DPRINTF(BE, "Instruction [sn:%lli] ready, addding to " "exeList.\n", @@ -987,16 +1002,10 @@ template void LWBackEnd::instToCommit(DynInstPtr &inst) { - DPRINTF(BE, "Sending instructions to commit [sn:%lli] PC %#x.\n", inst->seqNum, inst->readPC()); if (!inst->isSquashed()) { - DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", - inst->seqNum, inst->readPC()); - - inst->setCanCommit(); - if (inst->isExecuted()) { inst->setResultReady(); int dependents = wakeDependents(inst); @@ -1007,8 +1016,32 @@ LWBackEnd::instToCommit(DynInstPtr &inst) } } + writeback.push_back(inst); + + numInstsToWB[0]++; + writebackCount[0]++; } + +template +void +LWBackEnd::readyInstsForCommit() +{ + for (int i = numInstsToWB[-latency]; + !writeback.empty() && i; + --i) + { + DynInstPtr inst = writeback.front(); + writeback.pop_front(); + if (!inst->isSquashed()) { + DPRINTF(BE, "Writing back instruction [sn:%lli] PC %#x.\n", + inst->seqNum, inst->readPC()); + + inst->setCanCommit(); + } + } +} + #if 0 template void @@ -1221,6 +1254,20 @@ LWBackEnd::commitInst(int inst_num) ++freed_regs; } +#if FULL_SYSTEM + if (thread->profile) { +// bool usermode = +// (xc->readMiscReg(AlphaISA::IPR_DTB_CM) & 0x18) != 0; +// thread->profilePC = usermode ? 
1 : inst->readPC(); + thread->profilePC = inst->readPC(); + ProfileNode *node = thread->profile->consume(thread->getXCProxy(), + inst->staticInst); + + if (node) + thread->profileNode = node; + } +#endif + if (inst->traceData) { inst->traceData->setFetchSeq(inst->seqNum); inst->traceData->setCPSeq(thread->numInst); @@ -1280,9 +1327,9 @@ LWBackEnd::commitInsts() while (!instList.empty() && inst_num < commitWidth) { if (instList.back()->isSquashed()) { instList.back()->clearDependents(); + ROBSquashedInsts[instList.back()->threadNumber]++; instList.pop_back(); --numInsts; - ROBSquashedInsts[instList.back()->threadNumber]++; continue; } @@ -1304,10 +1351,10 @@ LWBackEnd::squash(const InstSeqNum &sn) LSQ.squash(sn); int freed_regs = 0; - InstListIt waiting_list_end = waitingList.end(); + InstListIt insts_end_it = waitingList.end(); InstListIt insts_it = waitingList.begin(); - while (insts_it != waiting_list_end && (*insts_it)->seqNum > sn) + while (insts_it != insts_end_it && (*insts_it)->seqNum > sn) { if ((*insts_it)->isSquashed()) { ++insts_it; @@ -1333,6 +1380,7 @@ LWBackEnd::squash(const InstSeqNum &sn) while (!instList.empty() && (*insts_it)->seqNum > sn) { if ((*insts_it)->isSquashed()) { + panic("Instruction should not be already squashed and on list!"); ++insts_it; continue; } @@ -1364,18 +1412,6 @@ LWBackEnd::squash(const InstSeqNum &sn) --numInsts; } - insts_it = waitingList.begin(); - while (!waitingList.empty() && insts_it != waitingList.end()) { - if ((*insts_it)->seqNum < sn) { - ++insts_it; - continue; - } - assert((*insts_it)->isSquashed()); - - waitingList.erase(insts_it++); - waitingInsts--; - } - while (memBarrier && memBarrier->seqNum > sn) { DPRINTF(BE, "[sn:%lli] Memory barrier squashed (or previously " "squashed)\n", memBarrier->seqNum); @@ -1393,6 +1429,18 @@ LWBackEnd::squash(const InstSeqNum &sn) } } + insts_it = replayList.begin(); + insts_end_it = replayList.end(); + while (!replayList.empty() && insts_it != insts_end_it) { + if 
((*insts_it)->seqNum < sn) { + ++insts_it; + continue; + } + assert((*insts_it)->isSquashed()); + + replayList.erase(insts_it++); + } + frontEnd->addFreeRegs(freed_regs); } @@ -1463,14 +1511,6 @@ LWBackEnd::squashDueToMemBlocked(DynInstPtr &inst) frontEnd->squash(inst->seqNum - 1, inst->readPC()); } -template -void -LWBackEnd::fetchFault(Fault &fault) -{ - faultFromFetch = fault; - fetchHasFault = true; -} - template void LWBackEnd::switchOut() @@ -1489,16 +1529,25 @@ LWBackEnd::doSwitchOut() // yet written back. assert(robEmpty()); assert(!LSQ.hasStoresToWB()); + writeback.clear(); + for (int i = 0; i < numInstsToWB.getSize() + 1; ++i) + numInstsToWB.advance(); +// squash(0); + assert(waitingList.empty()); + assert(instList.empty()); + assert(replayList.empty()); + assert(writeback.empty()); LSQ.switchOut(); - - squash(0); } template void LWBackEnd::takeOverFrom(ExecContext *old_xc) { + assert(!squashPending); + squashSeqNum = 0; + squashNextPC = 0; xcSquash = false; trapSquash = false; @@ -1641,6 +1690,45 @@ LWBackEnd::dumpInsts() ++num; } + inst_list_it = --(writeback.end()); + + cprintf("Writeback list size: %i\n", writeback.size()); + + while (inst_list_it != writeback.end()) + { + cprintf("Instruction:%i\n", + num); + if (!(*inst_list_it)->isSquashed()) { + if (!(*inst_list_it)->isIssued()) { + ++valid_num; + cprintf("Count:%i\n", valid_num); + } else if ((*inst_list_it)->isMemRef() && + !(*inst_list_it)->memOpDone) { + // Loads that have not been marked as executed still count + // towards the total instructions. 
+ ++valid_num; + cprintf("Count:%i\n", valid_num); + } + } + + cprintf("PC:%#x\n[sn:%lli]\n[tid:%i]\n" + "Issued:%i\nSquashed:%i\n", + (*inst_list_it)->readPC(), + (*inst_list_it)->seqNum, + (*inst_list_it)->threadNumber, + (*inst_list_it)->isIssued(), + (*inst_list_it)->isSquashed()); + + if ((*inst_list_it)->isMemRef()) { + cprintf("MemOpDone:%i\n", (*inst_list_it)->memOpDone); + } + + cprintf("\n"); + + inst_list_it--; + ++num; + } + cprintf("Waiting list size: %i\n", waitingList.size()); inst_list_it = --(waitingList.end()); diff --git a/cpu/ozone/lw_lsq.hh b/cpu/ozone/lw_lsq.hh index c0bf0b0fe..07fd1aec5 100644 --- a/cpu/ozone/lw_lsq.hh +++ b/cpu/ozone/lw_lsq.hh @@ -110,6 +110,8 @@ class OzoneLWLSQ { /** Returns the name of the LSQ unit. */ std::string name() const; + void regStats(); + /** Sets the CPU pointer. */ void setCPU(FullCPU *cpu_ptr) { cpu = cpu_ptr; } @@ -203,7 +205,7 @@ class OzoneLWLSQ { int numLoads() { return loads; } /** Returns the number of stores in the SQ. */ - int numStores() { return stores; } + int numStores() { return stores + storesInFlight; } /** Returns if either the LQ or SQ is full. */ bool isFull() { return lqFull() || sqFull(); } @@ -212,7 +214,7 @@ class OzoneLWLSQ { bool lqFull() { return loads >= (LQEntries - 1); } /** Returns if the SQ is full. */ - bool sqFull() { return stores >= (SQEntries - 1); } + bool sqFull() { return (stores + storesInFlight) >= (SQEntries - 1); } /** Debugging function to dump instructions in the LSQ. */ void dumpInsts(); @@ -241,7 +243,9 @@ class OzoneLWLSQ { private: /** Completes the store at the specified index. */ - void completeStore(int store_idx); + void completeStore(DynInstPtr &inst); + + void removeStore(int store_idx); private: /** Pointer to the CPU. */ @@ -342,6 +346,10 @@ class OzoneLWLSQ { int storesToWB; + public: + int storesInFlight; + + private: /// @todo Consider moving to a more advanced model with write vs read ports /** The number of cache ports available each cycle. 
*/ int cachePorts; @@ -351,6 +359,9 @@ class OzoneLWLSQ { //list mshrSeqNums; + /** Total number of memory ordering violations. */ + Stats::Scalar<> lsqMemOrderViolation; + //Stats::Scalar<> dcacheStallCycles; Counter lastDcacheStall; diff --git a/cpu/ozone/lw_lsq_impl.hh b/cpu/ozone/lw_lsq_impl.hh index f72bbb1cc..c60884fc3 100644 --- a/cpu/ozone/lw_lsq_impl.hh +++ b/cpu/ozone/lw_lsq_impl.hh @@ -57,6 +57,7 @@ OzoneLWLSQ::StoreCompletionEvent::process() // lsqPtr->cpu->wakeCPU(); if (lsqPtr->isSwitchedOut()) { + panic("Should not be switched out!"); if (wbEvent) delete wbEvent; @@ -68,7 +69,11 @@ OzoneLWLSQ::StoreCompletionEvent::process() delete wbEvent; } - lsqPtr->completeStore(inst->sqIdx); + lsqPtr->completeStore(inst); + lsqPtr->removeStore(inst->sqIdx); + --(lsqPtr->storesInFlight); + + DPRINTF(OzoneLSQ, "StoresInFlight: %i\n", lsqPtr->storesInFlight); if (miss) be->removeDcacheMiss(inst); } @@ -82,7 +87,7 @@ OzoneLWLSQ::StoreCompletionEvent::description() template OzoneLWLSQ::OzoneLWLSQ() - : loads(0), stores(0), storesToWB(0), stalled(false), isLoadBlocked(false), + : loads(0), stores(0), storesToWB(0), storesInFlight(0), stalled(false), isLoadBlocked(false), loadBlockedHandled(false) { } @@ -121,6 +126,15 @@ OzoneLWLSQ::name() const return "lsqunit"; } +template +void +OzoneLWLSQ::regStats() +{ + lsqMemOrderViolation + .name(name() + ".memOrderViolation") + .desc("Number of memory ordering violations"); +} + template void OzoneLWLSQ::clearLQ() @@ -257,7 +271,7 @@ unsigned OzoneLWLSQ::numFreeEntries() { unsigned free_lq_entries = LQEntries - loads; - unsigned free_sq_entries = SQEntries - stores; + unsigned free_sq_entries = SQEntries - (stores + storesInFlight); // Both the LQ and SQ entries have an extra dummy entry to differentiate // empty/full conditions. Subtract 1 from the free entries. @@ -397,6 +411,7 @@ OzoneLWLSQ::executeStore(DynInstPtr &store_inst) // A load incorrectly passed this store. Squash and refetch. 
// For now return a fault to show that it was unsuccessful. memDepViolator = (*lq_it); + ++lsqMemOrderViolation; return TheISA::genMachineCheckFault(); } @@ -483,8 +498,8 @@ OzoneLWLSQ::writebackStores() if ((*sq_it).size == 0 && !(*sq_it).completed) { sq_it--; - completeStore(inst->sqIdx); - + removeStore(inst->sqIdx); + completeStore(inst); continue; } @@ -540,6 +555,8 @@ OzoneLWLSQ::writebackStores() inst->sqIdx,inst->readPC(), req->paddr, *(req->data), inst->seqNum); + DPRINTF(OzoneLSQ, "StoresInFlight: %i\n", + storesInFlight + 1); if (dcacheInterface) { assert(!req->completionEvent); @@ -601,6 +618,8 @@ OzoneLWLSQ::writebackStores() } sq_it--; } + ++storesInFlight; +// removeStore(inst->sqIdx); } else { panic("Must HAVE DCACHE!!!!!\n"); } @@ -617,7 +636,7 @@ void OzoneLWLSQ::squash(const InstSeqNum &squashed_num) { DPRINTF(OzoneLSQ, "Squashing until [sn:%lli]!" - "(Loads:%i Stores:%i)\n",squashed_num,loads,stores); + "(Loads:%i Stores:%i)\n",squashed_num,loads,stores+storesInFlight); LQIt lq_it = loadQueue.begin(); @@ -732,7 +751,7 @@ OzoneLWLSQ::dumpInsts() template void -OzoneLWLSQ::completeStore(int store_idx) +OzoneLWLSQ::removeStore(int store_idx) { SQHashIt sq_hash_it = SQItHash.find(store_idx); assert(sq_hash_it != SQItHash.end()); @@ -742,8 +761,6 @@ OzoneLWLSQ::completeStore(int store_idx) (*sq_it).completed = true; DynInstPtr inst = (*sq_it).inst; - --storesToWB; - if (isStalled() && inst->seqNum == stallingStoreIsn) { DPRINTF(OzoneLSQ, "Unstalling, stalling store [sn:%lli] " @@ -761,6 +778,13 @@ OzoneLWLSQ::completeStore(int store_idx) SQItHash.erase(sq_hash_it); SQIndices.push(inst->sqIdx); storeQueue.erase(sq_it); +} + +template +void +OzoneLWLSQ::completeStore(DynInstPtr &inst) +{ + --storesToWB; --stores; inst->setCompleted(); @@ -839,9 +863,14 @@ OzoneLWLSQ::switchOut() } // Clear the queue to free up resources + assert(stores == 0); + assert(storeQueue.empty()); + assert(loads == 0); + assert(loadQueue.empty()); + assert(storesInFlight == 0); 
storeQueue.clear(); loadQueue.clear(); - loads = stores = storesToWB = 0; + loads = stores = storesToWB = storesInFlight = 0; } template diff --git a/cpu/ozone/simple_params.hh b/cpu/ozone/simple_params.hh index 7b5c6f67b..d28d040f8 100644 --- a/cpu/ozone/simple_params.hh +++ b/cpu/ozone/simple_params.hh @@ -70,10 +70,11 @@ class SimpleParams : public BaseCPU::Params unsigned cachePorts; unsigned width; + unsigned frontEndLatency; unsigned frontEndWidth; + unsigned backEndLatency; unsigned backEndWidth; unsigned backEndSquashLatency; - unsigned backEndLatency; unsigned maxInstBufferSize; unsigned numPhysicalRegs; unsigned maxOutstandingMemOps; @@ -149,6 +150,7 @@ class SimpleParams : public BaseCPU::Params // unsigned LQEntries; unsigned SQEntries; + bool lsqLimits; // // Memory dependence