diff --git a/src/arch/sparc/isa/operands.isa b/src/arch/sparc/isa/operands.isa index 58d616a7a..110b37d15 100644 --- a/src/arch/sparc/isa/operands.isa +++ b/src/arch/sparc/isa/operands.isa @@ -187,7 +187,7 @@ def operands {{ 'Hver': ('ControlReg', 'udw', 'MISCREG_HVER', None, 74), 'StrandStsReg': ('ControlReg', 'udw', 'MISCREG_STRAND_STS_REG', None, 75), - 'Fsr': ('ControlReg', 'udw', 'MISCREG_FSR', None, 80), + 'Fsr': ('ControlReg', 'udw', 'MISCREG_FSR', (None, None, ['IsSerializeAfter','IsSerializing','IsNonSpeculative']), 80), # Mem gets a large number so it's always last 'Mem': ('Mem', 'udw', None, ('IsMemRef', 'IsLoad', 'IsStore'), 100) diff --git a/src/arch/sparc/types.hh b/src/arch/sparc/types.hh index 15386adca..8bd50b7e8 100644 --- a/src/arch/sparc/types.hh +++ b/src/arch/sparc/types.hh @@ -59,7 +59,7 @@ namespace SparcISA typedef int RegContextVal; - typedef uint8_t RegIndex; + typedef uint16_t RegIndex; } #endif diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index eed05c2f1..1311e5cf2 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -209,6 +209,9 @@ class BaseDynInst : public FastAlloc, public RefCounted /** PC of this instruction. */ Addr PC; + /** Micro PC of this instruction. */ + Addr microPC; + protected: /** Next non-speculative PC. It is not filled in at fetch, but rather * once the target of the branch is truly known (either decode or @@ -219,12 +222,18 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Next non-speculative NPC. Target PC for Mips or Sparc. */ Addr nextNPC; + /** Next non-speculative micro PC. */ + Addr nextMicroPC; + /** Predicted next PC. */ Addr predPC; /** Predicted next NPC. */ Addr predNPC; + /** Predicted next microPC */ + Addr predMicroPC; + /** If this is a branch that was predicted taken */ bool predTaken; @@ -340,6 +349,17 @@ class BaseDynInst : public FastAlloc, public RefCounted { _flatDestRegIdx[idx] = flattened_dest; } + /** BaseDynInst constructor given a binary instruction. + * @param staticInst A StaticInstPtr to the underlying instruction. + * @param PC The PC of the instruction. + * @param pred_PC The predicted next PC. + * @param pred_NPC The predicted next NPC. + * @param seq_num The sequence number of the instruction. + * @param cpu Pointer to the instruction's CPU. + */ + BaseDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC, + Addr pred_PC, Addr pred_NPC, Addr pred_MicroPC, + InstSeqNum seq_num, ImplCPU *cpu); /** BaseDynInst constructor given a binary instruction. * @param inst The binary instruction. @@ -349,8 +369,8 @@ class BaseDynInst : public FastAlloc, public RefCounted * @param seq_num The sequence number of the instruction. * @param cpu Pointer to the instruction's CPU. */ - BaseDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, - Addr pred_PC, Addr pred_NPC, + BaseDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, Addr microPC, + Addr pred_PC, Addr pred_NPC, Addr pred_MicroPC, InstSeqNum seq_num, ImplCPU *cpu); /** BaseDynInst constructor given a StaticInst pointer. @@ -402,11 +422,18 @@ class BaseDynInst : public FastAlloc, public RefCounted #endif } + Addr readNextMicroPC() + { + return nextMicroPC; + } + /** Set the predicted target of this current instruction. */ - void setPredTarg(Addr predicted_PC, Addr predicted_NPC) + void setPredTarg(Addr predicted_PC, Addr predicted_NPC, + Addr predicted_MicroPC) { predPC = predicted_PC; predNPC = predicted_NPC; + predMicroPC = predicted_MicroPC; } /** Returns the predicted PC immediately after the branch. */ @@ -415,6 +442,9 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Returns the predicted PC two instructions after the branch */ Addr readPredNPC() { return predNPC; } + /** Returns the predicted micro PC after the branch */ + Addr readPredMicroPC() { return predMicroPC; } + /** Returns whether the instruction was predicted taken or not. */ bool readPredTaken() { @@ -430,7 +460,8 @@ class BaseDynInst : public FastAlloc, public RefCounted bool mispredicted() { return readPredPC() != readNextPC() || - readPredNPC() != readNextNPC(); + readPredNPC() != readNextNPC() || + readPredMicroPC() != readNextMicroPC(); } // @@ -467,6 +498,12 @@ class BaseDynInst : public FastAlloc, public RefCounted bool isQuiesce() const { return staticInst->isQuiesce(); } bool isIprAccess() const { return staticInst->isIprAccess(); } bool isUnverifiable() const { return staticInst->isUnverifiable(); } + bool isMacroOp() const { return staticInst->isMacroOp(); } + bool isMicroOp() const { return staticInst->isMicroOp(); } + bool isDelayedCommit() const { return staticInst->isDelayedCommit(); } + bool isLastMicroOp() const { return staticInst->isLastMicroOp(); } + bool isFirstMicroOp() const { return staticInst->isFirstMicroOp(); } + bool isMicroBranch() const { return staticInst->isMicroBranch(); } /** Temporarily sets this instruction as a serialize before instruction. */ void setSerializeBefore() { status.set(SerializeBefore); } @@ -700,16 +737,26 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Read the PC of this instruction. */ const Addr readPC() const { return PC; } + /**Read the micro PC of this instruction. */ + const Addr readMicroPC() const { return microPC; } + /** Set the next PC of this instruction (its actual target). */ - void setNextPC(uint64_t val) + void setNextPC(Addr val) { nextPC = val; } /** Set the next NPC of this instruction (the target in Mips or Sparc).*/ - void setNextNPC(uint64_t val) + void setNextNPC(Addr val) { +#if ISA_HAS_DELAY_SLOT nextNPC = val; +#endif + } + + void setNextMicroPC(Addr val) + { + nextMicroPC = val; } /** Sets the ASID. */ diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh index a1c866336..acf8af9cf 100644 --- a/src/cpu/base_dyn_inst_impl.hh +++ b/src/cpu/base_dyn_inst_impl.hh @@ -62,19 +62,66 @@ my_hash_t thishash; #endif template -BaseDynInst::BaseDynInst(TheISA::ExtMachInst machInst, +BaseDynInst::BaseDynInst(StaticInstPtr _staticInst, Addr inst_PC, Addr inst_NPC, + Addr inst_MicroPC, Addr pred_PC, Addr pred_NPC, + Addr pred_MicroPC, InstSeqNum seq_num, ImplCPU *cpu) - : staticInst(machInst), traceData(NULL), cpu(cpu) + : staticInst(_staticInst), traceData(NULL), cpu(cpu) { seqNum = seq_num; + bool nextIsMicro = + staticInst->isMicroOp() && !staticInst->isLastMicroOp(); + PC = inst_PC; - nextPC = inst_NPC; - nextNPC = nextPC + sizeof(TheISA::MachInst); + microPC = inst_MicroPC; + if (nextIsMicro) { + nextPC = inst_PC; + nextNPC = inst_NPC; + nextMicroPC = microPC + 1; + } else { + nextPC = inst_NPC; + nextNPC = nextPC + sizeof(TheISA::MachInst); + nextMicroPC = 0; + } predPC = pred_PC; predNPC = pred_NPC; + predMicroPC = pred_MicroPC; + predTaken = false; + + initVars(); +} + +template +BaseDynInst::BaseDynInst(TheISA::ExtMachInst inst, + Addr inst_PC, Addr inst_NPC, + Addr inst_MicroPC, + Addr pred_PC, Addr pred_NPC, + Addr pred_MicroPC, + InstSeqNum seq_num, ImplCPU *cpu) + : staticInst(inst), traceData(NULL), cpu(cpu) +{ + seqNum = seq_num; + + bool nextIsMicro = + staticInst->isMicroOp() && !staticInst->isLastMicroOp(); + + PC = inst_PC; + microPC = inst_MicroPC; + if (nextIsMicro) { + nextPC = inst_PC; + nextNPC = inst_NPC; + nextMicroPC = microPC + 1; + } else { + nextPC = inst_NPC; + nextNPC = nextPC + sizeof(TheISA::MachInst); + nextMicroPC = 0; + } + predPC = pred_PC; + predNPC = pred_NPC; + predMicroPC = pred_MicroPC; predTaken = false; initVars(); diff --git a/src/cpu/o3/alpha/dyn_inst.hh b/src/cpu/o3/alpha/dyn_inst.hh index 20759d849..a6fb7b885 100644 --- a/src/cpu/o3/alpha/dyn_inst.hh +++ b/src/cpu/o3/alpha/dyn_inst.hh @@ -73,8 +73,13 @@ class AlphaDynInst : public BaseDynInst public: /** BaseDynInst constructor given a binary instruction. */ - AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC, - Addr Pred_PC, Addr Pred_NPC, + AlphaDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu); + + /** BaseDynInst constructor given a binary instruction. */ + AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu); /** BaseDynInst constructor given a static inst pointer. */ diff --git a/src/cpu/o3/alpha/dyn_inst_impl.hh b/src/cpu/o3/alpha/dyn_inst_impl.hh index fdce1ade5..6dfe0ccdd 100644 --- a/src/cpu/o3/alpha/dyn_inst_impl.hh +++ b/src/cpu/o3/alpha/dyn_inst_impl.hh @@ -31,10 +31,25 @@ #include "cpu/o3/alpha/dyn_inst.hh" template -AlphaDynInst::AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC, +AlphaDynInst::AlphaDynInst(StaticInstPtr staticInst, + Addr PC, Addr NPC, Addr microPC, Addr Pred_PC, Addr Pred_NPC, + Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu) - : BaseDynInst(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu) + : BaseDynInst(staticInst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) +{ + initVars(); +} + +template +AlphaDynInst::AlphaDynInst(ExtMachInst inst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, + Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu) + : BaseDynInst(inst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) { initVars(); } diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index d96919007..fb772060b 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -87,10 +87,10 @@ struct DefaultIEWDefaultCommit { bool squash[Impl::MaxThreads]; bool branchMispredict[Impl::MaxThreads]; bool branchTaken[Impl::MaxThreads]; - bool squashDelaySlot[Impl::MaxThreads]; - uint64_t mispredPC[Impl::MaxThreads]; - uint64_t nextPC[Impl::MaxThreads]; - uint64_t nextNPC[Impl::MaxThreads]; + Addr mispredPC[Impl::MaxThreads]; + Addr nextPC[Impl::MaxThreads]; + Addr nextNPC[Impl::MaxThreads]; + Addr nextMicroPC[Impl::MaxThreads]; InstSeqNum squashedSeqNum[Impl::MaxThreads]; bool includeSquashInst[Impl::MaxThreads]; @@ -114,15 +114,15 @@ struct TimeBufStruct { uint64_t branchAddr; InstSeqNum doneSeqNum; - InstSeqNum bdelayDoneSeqNum; // @todo: Might want to package this kind of branch stuff into a single // struct as it is used pretty frequently. bool branchMispredict; bool branchTaken; - uint64_t mispredPC; - uint64_t nextPC; - uint64_t nextNPC; + Addr mispredPC; + Addr nextPC; + Addr nextNPC; + Addr nextMicroPC; unsigned branchCount; }; @@ -160,18 +160,16 @@ struct TimeBufStruct { bool branchMispredict; bool branchTaken; - uint64_t mispredPC; - uint64_t nextPC; - uint64_t nextNPC; + Addr mispredPC; + Addr nextPC; + Addr nextNPC; + Addr nextMicroPC; // Represents the instruction that has either been retired or // squashed. Similar to having a single bus that broadcasts the // retired or squashed sequence number. InstSeqNum doneSeqNum; - InstSeqNum bdelayDoneSeqNum; - bool squashDelaySlot; - //Just in case we want to do a commit/squash on a cycle //(necessary for multiple ROBs?) bool commitInsts; diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index fba618c14..27bdd20c5 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -279,25 +279,37 @@ class DefaultCommit /** Returns the PC of the head instruction of the ROB. * @todo: Probably remove this function as it returns only thread 0. */ - uint64_t readPC() { return PC[0]; } + Addr readPC() { return PC[0]; } /** Returns the PC of a specific thread. */ - uint64_t readPC(unsigned tid) { return PC[tid]; } + Addr readPC(unsigned tid) { return PC[tid]; } /** Sets the PC of a specific thread. */ - void setPC(uint64_t val, unsigned tid) { PC[tid] = val; } + void setPC(Addr val, unsigned tid) { PC[tid] = val; } + + /** Reads the micro PC of a specific thread. */ + Addr readMicroPC(unsigned tid) { return microPC[tid]; } + + /** Sets the micro PC of a specific thread */ + void setMicroPC(Addr val, unsigned tid) { microPC[tid] = val; } /** Reads the next PC of a specific thread. */ - uint64_t readNextPC(unsigned tid) { return nextPC[tid]; } + Addr readNextPC(unsigned tid) { return nextPC[tid]; } /** Sets the next PC of a specific thread. */ - void setNextPC(uint64_t val, unsigned tid) { nextPC[tid] = val; } + void setNextPC(Addr val, unsigned tid) { nextPC[tid] = val; } /** Reads the next NPC of a specific thread. */ - uint64_t readNextNPC(unsigned tid) { return nextNPC[tid]; } + Addr readNextNPC(unsigned tid) { return nextNPC[tid]; } /** Sets the next NPC of a specific thread. */ - void setNextNPC(uint64_t val, unsigned tid) { nextNPC[tid] = val; } + void setNextNPC(Addr val, unsigned tid) { nextNPC[tid] = val; } + + /** Reads the micro PC of a specific thread. */ + Addr readNextMicroPC(unsigned tid) { return nextMicroPC[tid]; } + + /** Sets the micro PC of a specific thread */ + void setNextMicroPC(Addr val, unsigned tid) { nextMicroPC[tid] = val; } private: /** Time buffer interface. */ @@ -402,12 +414,20 @@ class DefaultCommit */ Addr PC[Impl::MaxThreads]; + /** The commit micro PC of each thread. Refers to the instruction that + * is currently being processed/committed. + */ + Addr microPC[Impl::MaxThreads]; + /** The next PC of each thread. */ Addr nextPC[Impl::MaxThreads]; /** The next NPC of each thread. */ Addr nextNPC[Impl::MaxThreads]; + /** The next micro PC of each thread. */ + Addr nextMicroPC[Impl::MaxThreads]; + /** The sequence number of the youngest valid instruction in the ROB. */ InstSeqNum youngestSeqNum[Impl::MaxThreads]; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 65625065d..fc24d7edc 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -124,7 +124,7 @@ DefaultCommit::DefaultCommit(O3CPU *_cpu, Params *params) committedStores[i] = false; trapSquash[i] = false; tcSquash[i] = false; - PC[i] = nextPC[i] = nextNPC[i] = 0; + microPC[i] = nextMicroPC[i] = PC[i] = nextPC[i] = nextNPC[i] = 0; } #if FULL_SYSTEM interrupt = NoFault; @@ -508,6 +508,7 @@ DefaultCommit::squashAll(unsigned tid) toIEW->commitInfo[tid].nextPC = PC[tid]; toIEW->commitInfo[tid].nextNPC = nextPC[tid]; + toIEW->commitInfo[tid].nextMicroPC = nextMicroPC[tid]; } template @@ -741,38 +742,15 @@ DefaultCommit::commit() // then use one older sequence number. InstSeqNum squashed_inst = fromIEW->squashedSeqNum[tid]; -#if ISA_HAS_DELAY_SLOT - InstSeqNum bdelay_done_seq_num = squashed_inst; - bool squash_bdelay_slot = fromIEW->squashDelaySlot[tid]; - bool branchMispredict = fromIEW->branchMispredict[tid]; - - // Squashing/not squashing the branch delay slot only makes - // sense when you're squashing from a branch, ie from a branch - // mispredict. - if (branchMispredict && !squash_bdelay_slot) { - bdelay_done_seq_num++; - } -#endif - if (fromIEW->includeSquashInst[tid] == true) { squashed_inst--; -#if ISA_HAS_DELAY_SLOT - bdelay_done_seq_num--; -#endif } // All younger instructions will be squashed. Set the sequence // number as the youngest instruction in the ROB. youngestSeqNum[tid] = squashed_inst; -#if ISA_HAS_DELAY_SLOT - rob->squash(bdelay_done_seq_num, tid); - toIEW->commitInfo[tid].squashDelaySlot = squash_bdelay_slot; - toIEW->commitInfo[tid].bdelayDoneSeqNum = bdelay_done_seq_num; -#else rob->squash(squashed_inst, tid); - toIEW->commitInfo[tid].squashDelaySlot = true; -#endif changedROBNumEntries[tid] = true; toIEW->commitInfo[tid].doneSeqNum = squashed_inst; @@ -791,6 +769,7 @@ DefaultCommit::commit() toIEW->commitInfo[tid].nextPC = fromIEW->nextPC[tid]; toIEW->commitInfo[tid].nextNPC = fromIEW->nextNPC[tid]; + toIEW->commitInfo[tid].nextMicroPC = fromIEW->nextMicroPC[tid]; toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid]; @@ -809,10 +788,6 @@ DefaultCommit::commit() // Try to commit any instructions. commitInsts(); - } else { -#if ISA_HAS_DELAY_SLOT - skidInsert(); -#endif } //Check for any activity @@ -904,6 +879,7 @@ DefaultCommit::commitInsts() PC[tid] = head_inst->readPC(); nextPC[tid] = head_inst->readNextPC(); nextNPC[tid] = head_inst->readNextNPC(); + nextMicroPC[tid] = head_inst->readNextMicroPC(); // Increment the total number of non-speculative instructions // executed. @@ -932,12 +908,10 @@ DefaultCommit::commitInsts() } PC[tid] = nextPC[tid]; -#if ISA_HAS_DELAY_SLOT nextPC[tid] = nextNPC[tid]; nextNPC[tid] = nextNPC[tid] + sizeof(TheISA::MachInst); -#else - nextPC[tid] = nextPC[tid] + sizeof(TheISA::MachInst); -#endif + microPC[tid] = nextMicroPC[tid]; + nextMicroPC[tid] = microPC[tid] + 1; #if FULL_SYSTEM int count = 0; @@ -1164,37 +1138,13 @@ DefaultCommit::getInsts() { DPRINTF(Commit, "Getting instructions from Rename stage.\n"); -#if ISA_HAS_DELAY_SLOT - // Read any renamed instructions and place them into the ROB. - int insts_to_process = std::min((int)renameWidth, - (int)(fromRename->size + skidBuffer.size())); - int rename_idx = 0; - - DPRINTF(Commit, "%i insts available to process. Rename Insts:%i " - "SkidBuffer Insts:%i\n", insts_to_process, fromRename->size, - skidBuffer.size()); -#else // Read any renamed instructions and place them into the ROB. int insts_to_process = std::min((int)renameWidth, fromRename->size); -#endif - for (int inst_num = 0; inst_num < insts_to_process; ++inst_num) { DynInstPtr inst; -#if ISA_HAS_DELAY_SLOT - // Get insts from skidBuffer or from Rename - if (skidBuffer.size() > 0) { - DPRINTF(Commit, "Grabbing skidbuffer inst.\n"); - inst = skidBuffer.front(); - skidBuffer.pop(); - } else { - DPRINTF(Commit, "Grabbing rename inst.\n"); - inst = fromRename->insts[rename_idx++]; - } -#else inst = fromRename->insts[inst_num]; -#endif int tid = inst->threadNumber; if (!inst->isSquashed() && @@ -1216,30 +1166,6 @@ DefaultCommit::getInsts() inst->readPC(), inst->seqNum, tid); } } - -#if ISA_HAS_DELAY_SLOT - if (rename_idx < fromRename->size) { - DPRINTF(Commit,"Placing Rename Insts into skidBuffer.\n"); - - for (; - rename_idx < fromRename->size; - rename_idx++) { - DynInstPtr inst = fromRename->insts[rename_idx]; - - if (!inst->isSquashed()) { - DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ", - "skidBuffer.\n", inst->readPC(), inst->seqNum, - inst->threadNumber); - skidBuffer.push(inst); - } else { - DPRINTF(Commit, "Instruction PC %#x [sn:%i] [tid:%i] was " - "squashed, skipping.\n", - inst->readPC(), inst->seqNum, inst->threadNumber); - } - } - } -#endif - } template diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 2e6a43f9c..59978a065 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -696,7 +696,7 @@ FullO3CPU::removeThread(unsigned tid) // Squash Throughout Pipeline InstSeqNum squash_seq_num = commit.rob->readHeadInst(tid)->seqNum; - fetch.squash(0, sizeof(TheISA::MachInst), squash_seq_num, true, tid); + fetch.squash(0, sizeof(TheISA::MachInst), 0, squash_seq_num, tid); decode.squash(tid); rename.squash(squash_seq_num, tid); iew.squash(tid); @@ -1150,6 +1150,20 @@ FullO3CPU::setPC(Addr new_PC,unsigned tid) commit.setPC(new_PC, tid); } +template +uint64_t +FullO3CPU::readMicroPC(unsigned tid) +{ + return commit.readMicroPC(tid); +} + +template +void +FullO3CPU::setMicroPC(Addr new_PC,unsigned tid) +{ + commit.setMicroPC(new_PC, tid); +} + template uint64_t FullO3CPU::readNextPC(unsigned tid) @@ -1178,6 +1192,20 @@ FullO3CPU::setNextNPC(uint64_t val,unsigned tid) commit.setNextNPC(val, tid); } +template +uint64_t +FullO3CPU::readNextMicroPC(unsigned tid) +{ + return commit.readNextMicroPC(tid); +} + +template +void +FullO3CPU::setNextMicroPC(Addr new_PC,unsigned tid) +{ + commit.setNextMicroPC(new_PC, tid); +} + template typename FullO3CPU::ListIt FullO3CPU::addInst(DynInstPtr &inst) @@ -1226,9 +1254,7 @@ FullO3CPU::removeFrontInst(DynInstPtr &inst) template void -FullO3CPU::removeInstsNotInROB(unsigned tid, - bool squash_delay_slot, - const InstSeqNum &delay_slot_seq_num) +FullO3CPU::removeInstsNotInROB(unsigned tid) { DPRINTF(O3CPU, "Thread %i: Deleting instructions from instruction" " list.\n", tid); @@ -1259,12 +1285,6 @@ FullO3CPU::removeInstsNotInROB(unsigned tid, while (inst_it != end_it) { assert(!instList.empty()); -#if ISA_HAS_DELAY_SLOT - if(!squash_delay_slot && - delay_slot_seq_num >= (*inst_it)->seqNum) { - break; - } -#endif squashInstIt(inst_it, tid); inst_it--; diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index e71d05c8e..bff78bf9e 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -433,22 +433,34 @@ class FullO3CPU : public BaseO3CPU void setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid); /** Reads the commit PC of a specific thread. */ - uint64_t readPC(unsigned tid); + Addr readPC(unsigned tid); /** Sets the commit PC of a specific thread. */ void setPC(Addr new_PC, unsigned tid); + /** Reads the commit micro PC of a specific thread. */ + Addr readMicroPC(unsigned tid); + + /** Sets the commmit micro PC of a specific thread. */ + void setMicroPC(Addr new_microPC, unsigned tid); + /** Reads the next PC of a specific thread. */ - uint64_t readNextPC(unsigned tid); + Addr readNextPC(unsigned tid); /** Sets the next PC of a specific thread. */ - void setNextPC(uint64_t val, unsigned tid); + void setNextPC(Addr val, unsigned tid); /** Reads the next NPC of a specific thread. */ - uint64_t readNextNPC(unsigned tid); + Addr readNextNPC(unsigned tid); /** Sets the next NPC of a specific thread. */ - void setNextNPC(uint64_t val, unsigned tid); + void setNextNPC(Addr val, unsigned tid); + + /** Reads the commit next micro PC of a specific thread. */ + Addr readNextMicroPC(unsigned tid); + + /** Sets the commit next micro PC of a specific thread. */ + void setNextMicroPC(Addr val, unsigned tid); /** Function to add instruction onto the head of the list of the * instructions. Used when new instructions are fetched. @@ -468,8 +480,7 @@ class FullO3CPU : public BaseO3CPU /** Remove all instructions that are not currently in the ROB. * There's also an option to not squash delay slot instructions.*/ - void removeInstsNotInROB(unsigned tid, bool squash_delay_slot, - const InstSeqNum &delay_slot_seq_num); + void removeInstsNotInROB(unsigned tid); /** Remove all instructions younger than the given sequence number. */ void removeInstsUntil(const InstSeqNum &seq_num,unsigned tid); diff --git a/src/cpu/o3/decode_impl.hh b/src/cpu/o3/decode_impl.hh index 314864f94..ce6738456 100644 --- a/src/cpu/o3/decode_impl.hh +++ b/src/cpu/o3/decode_impl.hh @@ -49,8 +49,6 @@ DefaultDecode::DefaultDecode(O3CPU *_cpu, Params *params) stalls[i].rename = false; stalls[i].iew = false; stalls[i].commit = false; - - squashAfterDelaySlot[i] = false; } // @todo: Make into a parameter @@ -275,20 +273,16 @@ DefaultDecode::squash(DynInstPtr &inst, unsigned tid) ///explicitly for ISAs with delay slots. toFetch->decodeInfo[tid].nextNPC = inst->branchTarget() + sizeof(TheISA::MachInst); + toFetch->decodeInfo[tid].nextMicroPC = inst->readMicroPC(); #if ISA_HAS_DELAY_SLOT toFetch->decodeInfo[tid].branchTaken = inst->readNextNPC() != (inst->readNextPC() + sizeof(TheISA::MachInst)); - - toFetch->decodeInfo[tid].bdelayDoneSeqNum = bdelayDoneSeqNum[tid]; - squashAfterDelaySlot[tid] = false; - - InstSeqNum squash_seq_num = bdelayDoneSeqNum[tid]; #else toFetch->decodeInfo[tid].branchTaken = inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst)); +#endif InstSeqNum squash_seq_num = inst->seqNum; -#endif // Might have to tell fetch to unblock. if (decodeStatus[tid] == Blocked || @@ -309,30 +303,10 @@ DefaultDecode::squash(DynInstPtr &inst, unsigned tid) // Clear the instruction list and skid buffer in case they have any // insts in them. while (!insts[tid].empty()) { - -#if ISA_HAS_DELAY_SLOT - if (insts[tid].front()->seqNum <= squash_seq_num) { - DPRINTF(Decode, "[tid:%i]: Cannot remove incoming decode " - "instructions before delay slot [sn:%i]. %i insts" - "left in decode.\n", tid, squash_seq_num, - insts[tid].size()); - break; - } -#endif insts[tid].pop(); } while (!skidBuffer[tid].empty()) { - -#if ISA_HAS_DELAY_SLOT - if (skidBuffer[tid].front()->seqNum <= squash_seq_num) { - DPRINTF(Decode, "[tid:%i]: Cannot remove skidBuffer " - "instructions before delay slot [sn:%i]. %i insts" - "left in decode.\n", tid, squash_seq_num, - insts[tid].size()); - break; - } -#endif skidBuffer[tid].pop(); } @@ -760,48 +734,13 @@ DefaultDecode::decodeInsts(unsigned tid) // Might want to set some sort of boolean and just do // a check at the end -#if !ISA_HAS_DELAY_SLOT squash(inst, inst->threadNumber); Addr target = inst->branchTarget(); - inst->setPredTarg(target, target + sizeof(TheISA::MachInst)); + //The micro pc after an instruction level branch should be 0 + inst->setPredTarg(target, target + sizeof(TheISA::MachInst), 0); break; -#else - // If mispredicted as taken, then ignore delay slot - // instruction... else keep delay slot and squash - // after it is sent to rename - if (inst->readPredTaken() && inst->isCondDelaySlot()) { - DPRINTF(Decode, "[tid:%i]: Conditional delay slot inst." - "[sn:%i] PC %#x mispredicted as taken.\n", tid, - inst->seqNum, inst->PC); - bdelayDoneSeqNum[tid] = inst->seqNum; - squash(inst, inst->threadNumber); - Addr target = inst->branchTarget(); - inst->setPredTarg(target, - target + sizeof(TheISA::MachInst)); - break; - } else { - DPRINTF(Decode, "[tid:%i]: Misprediction detected at " - "[sn:%i] PC %#x, will squash after delay slot " - "inst. is sent to Rename\n", - tid, inst->seqNum, inst->PC); - bdelayDoneSeqNum[tid] = inst->seqNum + 1; - squashAfterDelaySlot[tid] = true; - squashInst[tid] = inst; - continue; - } -#endif } } - - if (squashAfterDelaySlot[tid]) { - assert(!inst->isSquashed()); - squash(squashInst[tid], squashInst[tid]->threadNumber); - Addr target = squashInst[tid]->branchTarget(); - squashInst[tid]->setPredTarg(target, - target + sizeof(TheISA::MachInst)); - assert(!inst->isSquashed()); - break; - } } // If we didn't process all instructions, then we will need to block diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 241935416..7645a226c 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -227,7 +227,7 @@ class DefaultFetch * @param next_NPC Used for ISAs which use delay slots. * @return Whether or not a branch was predicted as taken. */ - bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, Addr &next_NPC); + bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, Addr &next_NPC, Addr &next_MicroPC); /** * Fetches the cache line that contains fetch_PC. Returns any @@ -242,12 +242,14 @@ class DefaultFetch bool fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid); /** Squashes a specific thread and resets the PC. */ - inline void doSquash(const Addr &new_PC, const Addr &new_NPC, unsigned tid); + inline void doSquash(const Addr &new_PC, const Addr &new_NPC, + const Addr &new_MicroPC, unsigned tid); /** Squashes a specific thread and resets the PC. Also tells the CPU to * remove any instructions between fetch and decode that should be sqaushed. */ void squashFromDecode(const Addr &new_PC, const Addr &new_NPC, + const Addr &new_MicroPC, const InstSeqNum &seq_num, unsigned tid); /** Checks if a thread is stalled. */ @@ -263,8 +265,8 @@ class DefaultFetch * squash should be the commit stage. */ void squash(const Addr &new_PC, const Addr &new_NPC, - const InstSeqNum &seq_num, - bool squash_delay_slot, unsigned tid); + const Addr &new_MicroPC, + const InstSeqNum &seq_num, unsigned tid); /** Ticks the fetch stage, processing all inputs signals and fetching * as many instructions as possible. @@ -347,16 +349,12 @@ class DefaultFetch /** Per-thread fetch PC. */ Addr PC[Impl::MaxThreads]; + /** Per-thread fetch micro PC. */ + Addr microPC[Impl::MaxThreads]; + /** Per-thread next PC. */ Addr nextPC[Impl::MaxThreads]; - /** Per-thread next Next PC. - * This is not a real register but is used for - * architectures that use a branch-delay slot. - * (such as MIPS or Sparc) - */ - Addr nextNPC[Impl::MaxThreads]; - /** Memory request used to access cache. */ RequestPtr memReq[Impl::MaxThreads]; diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index e16f97558..3ae7bc402 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -312,7 +312,7 @@ DefaultFetch::initStage() for (int tid = 0; tid < numThreads; tid++) { PC[tid] = cpu->readPC(tid); nextPC[tid] = cpu->readNextPC(tid); - nextNPC[tid] = cpu->readNextNPC(tid); + microPC[tid] = cpu->readMicroPC(tid); } for (int tid=0; tid < numThreads; tid++) { @@ -439,11 +439,7 @@ DefaultFetch::takeOverFrom() stalls[i].commit = 0; PC[i] = cpu->readPC(i); nextPC[i] = cpu->readNextPC(i); -#if ISA_HAS_DELAY_SLOT - nextNPC[i] = cpu->readNextNPC(i); -#else - nextNPC[i] = nextPC[i] + sizeof(TheISA::MachInst); -#endif + microPC[i] = cpu->readMicroPC(i); fetchStatus[i] = Running; } numInst = 0; @@ -493,7 +489,7 @@ DefaultFetch::switchToInactive() template bool DefaultFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, - Addr &next_NPC) + Addr &next_NPC, Addr &next_MicroPC) { // Do branch prediction check here. // A bit of a misnomer...next_PC is actually the current PC until @@ -501,13 +497,22 @@ DefaultFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, bool predict_taken; if (!inst->isControl()) { - next_PC = next_NPC; - next_NPC = next_NPC + instSize; - inst->setPredTarg(next_PC, next_NPC); + if (inst->isMicroOp() && !inst->isLastMicroOp()) { + next_MicroPC++; + } else { + next_PC = next_NPC; + next_NPC = next_NPC + instSize; + next_MicroPC = 0; + } + inst->setPredTarg(next_PC, next_NPC, next_MicroPC); inst->setPredTaken(false); return false; } + //Assume for now that all control flow is to a different macroop which + //would reset the micro pc to 0. + next_MicroPC = 0; + int tid = inst->threadNumber; Addr pred_PC = next_PC; predict_taken = branchPred.predict(inst, pred_PC, tid); @@ -534,7 +539,7 @@ DefaultFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, #endif /* DPRINTF(Fetch, "[tid:%i]: Branch predicted to go to %#x and then %#x.\n", tid, next_PC, next_NPC);*/ - inst->setPredTarg(next_PC, next_NPC); + inst->setPredTarg(next_PC, next_NPC, next_MicroPC); inst->setPredTaken(predict_taken); ++fetchedBranches; @@ -658,14 +663,14 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid template inline void DefaultFetch::doSquash(const Addr &new_PC, - const Addr &new_NPC, unsigned tid) + const Addr &new_NPC, const Addr &new_microPC, unsigned tid) { DPRINTF(Fetch, "[tid:%i]: Squashing, setting PC to: %#x, NPC to: %#x.\n", tid, new_PC, new_NPC); PC[tid] = new_PC; nextPC[tid] = new_NPC; - nextNPC[tid] = new_NPC + instSize; + microPC[tid] = new_microPC; // Clear the icache miss if it's outstanding. if (fetchStatus[tid] == IcacheWaitResponse) { @@ -693,12 +698,12 @@ DefaultFetch::doSquash(const Addr &new_PC, template void DefaultFetch::squashFromDecode(const Addr &new_PC, const Addr &new_NPC, - const InstSeqNum &seq_num, - unsigned tid) + const Addr &new_MicroPC, + const InstSeqNum &seq_num, unsigned tid) { DPRINTF(Fetch, "[tid:%i]: Squashing from decode.\n",tid); - doSquash(new_PC, new_NPC, tid); + doSquash(new_PC, new_NPC, new_MicroPC, tid); // Tell the CPU to remove any instructions that are in flight between // fetch and decode. @@ -774,20 +779,15 @@ DefaultFetch::updateFetchStatus() template void DefaultFetch::squash(const Addr &new_PC, const Addr &new_NPC, - const InstSeqNum &seq_num, - bool squash_delay_slot, unsigned tid) + const Addr &new_MicroPC, + const InstSeqNum &seq_num, unsigned tid) { DPRINTF(Fetch, "[tid:%u]: Squash from commit.\n",tid); - doSquash(new_PC, new_NPC, tid); + doSquash(new_PC, new_NPC, new_MicroPC, tid); -#if ISA_HAS_DELAY_SLOT // Tell the CPU to remove any instructions that are not in the ROB. - cpu->removeInstsNotInROB(tid, squash_delay_slot, seq_num); -#else - // Tell the CPU to remove any instructions that are not in the ROB. - cpu->removeInstsNotInROB(tid, true, 0); -#endif + cpu->removeInstsNotInROB(tid); } template @@ -896,17 +896,11 @@ DefaultFetch::checkSignalsAndUpdate(unsigned tid) DPRINTF(Fetch, "[tid:%u]: Squashing instructions due to squash " "from commit.\n",tid); - -#if ISA_HAS_DELAY_SLOT - InstSeqNum doneSeqNum = fromCommit->commitInfo[tid].bdelayDoneSeqNum; -#else - InstSeqNum doneSeqNum = fromCommit->commitInfo[tid].doneSeqNum; -#endif // In any case, squash. squash(fromCommit->commitInfo[tid].nextPC, fromCommit->commitInfo[tid].nextNPC, - doneSeqNum, - fromCommit->commitInfo[tid].squashDelaySlot, + fromCommit->commitInfo[tid].nextMicroPC, + fromCommit->commitInfo[tid].doneSeqNum, tid); // Also check if there's a mispredict that happened. @@ -955,18 +949,14 @@ DefaultFetch::checkSignalsAndUpdate(unsigned tid) if (fetchStatus[tid] != Squashing) { -#if ISA_HAS_DELAY_SLOT - InstSeqNum doneSeqNum = fromDecode->decodeInfo[tid].bdelayDoneSeqNum; -#else - InstSeqNum doneSeqNum = fromDecode->decodeInfo[tid].doneSeqNum; -#endif DPRINTF(Fetch, "Squashing from decode with PC = %#x, NPC = %#x\n", fromDecode->decodeInfo[tid].nextPC, fromDecode->decodeInfo[tid].nextNPC); // Squash unless we're already squashing squashFromDecode(fromDecode->decodeInfo[tid].nextPC, fromDecode->decodeInfo[tid].nextNPC, - doneSeqNum, + fromDecode->decodeInfo[tid].nextMicroPC, + fromDecode->decodeInfo[tid].doneSeqNum, tid); return true; @@ -1020,9 +1010,9 @@ DefaultFetch::fetch(bool &status_change) DPRINTF(Fetch, "Attempting to fetch from [tid:%i]\n", tid); // The current PC. - Addr &fetch_PC = PC[tid]; - - Addr &fetch_NPC = nextPC[tid]; + Addr fetch_PC = PC[tid]; + Addr fetch_NPC = nextPC[tid]; + Addr fetch_MicroPC = microPC[tid]; // Fault code for memory access. Fault fault = NoFault; @@ -1081,6 +1071,7 @@ DefaultFetch::fetch(bool &status_change) Addr next_PC = fetch_PC; Addr next_NPC = fetch_NPC; + Addr next_MicroPC = fetch_MicroPC; InstSeqNum inst_seq; MachInst inst; @@ -1088,6 +1079,9 @@ DefaultFetch::fetch(bool &status_change) // @todo: Fix this hack. unsigned offset = (fetch_PC & cacheBlkMask) & ~3; + StaticInstPtr staticInst = NULL; + StaticInstPtr macroop = NULL; + if (fault == NoFault) { // If the read of the first instruction was successful, then grab the // instructions from the rest of the cache line and put them into the @@ -1100,11 +1094,9 @@ DefaultFetch::fetch(bool &status_change) // ended this fetch block. bool predicted_branch = false; - for (; - offset < cacheBlkSize && - numInst < fetchWidth && - !predicted_branch; - ++numInst) { + while (offset < cacheBlkSize && + numInst < fetchWidth && + !predicted_branch) { // If we're branching after this instruction, quite fetching // from the same block then. @@ -1115,91 +1107,103 @@ DefaultFetch::fetch(bool &status_change) fetch_PC, fetch_NPC); } - - // Get a sequence number. - inst_seq = cpu->getAndIncrementInstSeq(); - // Make sure this is a valid index. assert(offset <= cacheBlkSize - instSize); - // Get the instruction from the array of the cache line. - inst = TheISA::gtoh(*reinterpret_cast - (&cacheData[tid][offset])); + if (!macroop) { + // Get the instruction from the array of the cache line. + inst = TheISA::gtoh(*reinterpret_cast + (&cacheData[tid][offset])); - predecoder.setTC(cpu->thread[tid]->getTC()); - predecoder.moreBytes(fetch_PC, 0, inst); + predecoder.setTC(cpu->thread[tid]->getTC()); + predecoder.moreBytes(fetch_PC, 0, inst); - ext_inst = predecoder.getExtMachInst(); - - // Create a new DynInst from the instruction fetched. - DynInstPtr instruction = new DynInst(ext_inst, - fetch_PC, fetch_NPC, - next_PC, next_NPC, - inst_seq, cpu); - instruction->setTid(tid); - - instruction->setASID(tid); - - instruction->setThreadState(cpu->thread[tid]); - - DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created " - "[sn:%lli]\n", - tid, instruction->readPC(), inst_seq); - - //DPRINTF(Fetch, "[tid:%i]: MachInst is %#x\n", tid, ext_inst); - - DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n", - tid, instruction->staticInst->disassemble(fetch_PC)); - - instruction->traceData = - Trace::getInstRecord(curTick, cpu->tcBase(tid), - instruction->staticInst, - instruction->readPC()); - - ///FIXME This needs to be more robust in dealing with delay slots -#if !ISA_HAS_DELAY_SLOT -// predicted_branch |= -#endif - lookupAndUpdateNextPC(instruction, next_PC, next_NPC); - predicted_branch |= (next_PC != fetch_NPC); - - // Add instruction to the CPU's list of instructions. - instruction->setInstListIt(cpu->addInst(instruction)); - - // Write the instruction to the first slot in the queue - // that heads to decode. - toDecode->insts[numInst] = instruction; - - toDecode->size++; - - // Increment stat of fetched instructions. - ++fetchedInsts; - - // Move to the next instruction, unless we have a branch. - fetch_PC = next_PC; - fetch_NPC = next_NPC; - - if (instruction->isQuiesce()) { - DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!", - curTick); - fetchStatus[tid] = QuiescePending; - ++numInst; - status_change = true; - break; + ext_inst = predecoder.getExtMachInst(); + staticInst = StaticInstPtr(ext_inst); + if (staticInst->isMacroOp()) + macroop = staticInst; } + do { + if (macroop) { + staticInst = macroop->fetchMicroOp(fetch_MicroPC); + if (staticInst->isLastMicroOp()) + macroop = NULL; + } + // Get a sequence number. + inst_seq = cpu->getAndIncrementInstSeq(); + + // Create a new DynInst from the instruction fetched. + DynInstPtr instruction = new DynInst(staticInst, + fetch_PC, fetch_NPC, fetch_MicroPC, + next_PC, next_NPC, next_MicroPC, + inst_seq, cpu); + instruction->setTid(tid); + + instruction->setASID(tid); + + instruction->setThreadState(cpu->thread[tid]); + + DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created " + "[sn:%lli]\n", + tid, instruction->readPC(), inst_seq); + + //DPRINTF(Fetch, "[tid:%i]: MachInst is %#x\n", tid, ext_inst); + + DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n", + tid, instruction->staticInst->disassemble(fetch_PC)); + + instruction->traceData = + Trace::getInstRecord(curTick, cpu->tcBase(tid), + instruction->staticInst, + instruction->readPC()); + + ///FIXME This needs to be more robust in dealing with delay slots + predicted_branch |= + lookupAndUpdateNextPC(instruction, next_PC, next_NPC, next_MicroPC); + + // Add instruction to the CPU's list of instructions. + instruction->setInstListIt(cpu->addInst(instruction)); + + // Write the instruction to the first slot in the queue + // that heads to decode. + toDecode->insts[numInst] = instruction; + + toDecode->size++; + + // Increment stat of fetched instructions. + ++fetchedInsts; + + // Move to the next instruction, unless we have a branch. + fetch_PC = next_PC; + fetch_NPC = next_NPC; + fetch_MicroPC = next_MicroPC; + + if (instruction->isQuiesce()) { + DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!", + curTick); + fetchStatus[tid] = QuiescePending; + ++numInst; + status_change = true; + break; + } + + ++numInst; + } while (staticInst->isMicroOp() && + !staticInst->isLastMicroOp() && + numInst < fetchWidth); offset += instSize; } - if (offset >= cacheBlkSize) { - DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " - "block.\n", tid); + if (predicted_branch) { + DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch " + "instruction encountered.\n", tid); } else if (numInst >= fetchWidth) { DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth " "for this cycle.\n", tid); - } else if (predicted_branch) { - DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch " - "instruction encountered.\n", tid); + } else if (offset >= cacheBlkSize) { + DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " + "block.\n", tid); } } @@ -1212,12 +1216,8 @@ DefaultFetch::fetch(bool &status_change) if (fault == NoFault) { PC[tid] = next_PC; nextPC[tid] = next_NPC; - nextNPC[tid] = next_NPC + instSize; -#if ISA_HAS_DELAY_SLOT - DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, PC[tid]); -#else + microPC[tid] = next_MicroPC; DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, next_PC); -#endif } else { // We shouldn't be in an icache miss and also have a fault (an ITB // miss) @@ -1235,8 +1235,9 @@ DefaultFetch::fetch(bool &status_change) // We will use a nop in order to carry the fault. ext_inst = TheISA::NoopMachInst; + StaticInstPtr staticInst = new StaticInst(ext_inst); // Create a new DynInst from the dummy nop. - DynInstPtr instruction = new DynInst(ext_inst, + DynInstPtr instruction = new DynInst(staticInst, fetch_PC, fetch_NPC, next_PC, next_NPC, inst_seq, cpu); diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index ce2991cfb..eef5a15d2 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -402,9 +402,6 @@ class DefaultIEW /** Records if there is a fetch redirect on this cycle for each thread. */ bool fetchRedirect[Impl::MaxThreads]; - /** Keeps track of the last valid branch delay slot instss for threads */ - InstSeqNum bdelayDoneSeqNum[Impl::MaxThreads]; - /** Used to track if all instructions have been dispatched this cycle. * If they have not, then blocking must have occurred, and the instructions * would already be added to the skid buffer. diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 62e656e93..399c44909 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -69,7 +69,6 @@ DefaultIEW::DefaultIEW(O3CPU *_cpu, Params *params) dispatchStatus[i] = Running; stalls[i].commit = false; fetchRedirect[i] = false; - bdelayDoneSeqNum[i] = 0; } wbMax = wbWidth * params->wbDepth; @@ -410,31 +409,14 @@ DefaultIEW::squash(unsigned tid) instQueue.squash(tid); // Tell the LDSTQ to start squashing. -#if ISA_HAS_DELAY_SLOT - ldstQueue.squash(fromCommit->commitInfo[tid].bdelayDoneSeqNum, tid); -#else ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum, tid); -#endif updatedQueues = true; // Clear the skid buffer in case it has any data in it. DPRINTF(IEW, "[tid:%i]: Removing skidbuffer instructions until [sn:%i].\n", - tid, fromCommit->commitInfo[tid].bdelayDoneSeqNum); + tid, fromCommit->commitInfo[tid].doneSeqNum); while (!skidBuffer[tid].empty()) { -#if ISA_HAS_DELAY_SLOT - if (skidBuffer[tid].front()->seqNum <= - fromCommit->commitInfo[tid].bdelayDoneSeqNum) { - DPRINTF(IEW, "[tid:%i]: Cannot remove skidbuffer instructions " - "that occur before delay slot [sn:%i].\n", - fromCommit->commitInfo[tid].bdelayDoneSeqNum, - tid); - break; - } else { - DPRINTF(IEW, "[tid:%i]: Removing instruction [sn:%i] from " - "skidBuffer.\n", tid, skidBuffer[tid].front()->seqNum); - } -#endif if (skidBuffer[tid].front()->isLoad() || skidBuffer[tid].front()->isStore() ) { toRename->iewInfo[tid].dispatchedToLSQ++; @@ -445,8 +427,6 @@ DefaultIEW::squash(unsigned tid) skidBuffer[tid].pop(); } - bdelayDoneSeqNum[tid] = fromCommit->commitInfo[tid].bdelayDoneSeqNum; - emptyRenameInsts(tid); } @@ -462,38 +442,19 @@ DefaultIEW::squashDueToBranch(DynInstPtr &inst, unsigned tid) toCommit->mispredPC[tid] = inst->readPC(); toCommit->branchMispredict[tid] = true; - int instSize = sizeof(TheISA::MachInst); #if ISA_HAS_DELAY_SLOT - bool branch_taken = + int instSize = sizeof(TheISA::MachInst); + toCommit->branchTaken[tid] = !(inst->readNextPC() + instSize == inst->readNextNPC() && (inst->readNextPC() == inst->readPC() + instSize || inst->readNextPC() == inst->readPC() + 2 * instSize)); - DPRINTF(Sparc, "Branch taken = %s [sn:%i]\n", - branch_taken ? "true": "false", inst->seqNum); - - toCommit->branchTaken[tid] = branch_taken; - - bool squashDelaySlot = true; -// (inst->readNextPC() != inst->readPC() + sizeof(TheISA::MachInst)); - DPRINTF(Sparc, "Squash delay slot = %s [sn:%i]\n", - squashDelaySlot ? "true": "false", inst->seqNum); - toCommit->squashDelaySlot[tid] = squashDelaySlot; - //If we're squashing the delay slot, we need to pick back up at NextPC. - //Otherwise, NextPC isn't being squashed, so we should pick back up at - //NextNPC. - if (squashDelaySlot) { - toCommit->nextPC[tid] = inst->readNextPC(); - toCommit->nextNPC[tid] = inst->readNextNPC(); - } else { - toCommit->nextPC[tid] = inst->readNextNPC(); - toCommit->nextNPC[tid] = inst->readNextNPC() + instSize; - } #else toCommit->branchTaken[tid] = inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst)); - toCommit->nextPC[tid] = inst->readNextPC(); - toCommit->nextNPC[tid] = inst->readNextPC() + instSize; #endif + toCommit->nextPC[tid] = inst->readNextPC(); + toCommit->nextNPC[tid] = inst->readNextNPC(); + toCommit->nextMicroPC[tid] = inst->readNextMicroPC(); toCommit->includeSquashInst[tid] = false; @@ -510,11 +471,7 @@ DefaultIEW::squashDueToMemOrder(DynInstPtr &inst, unsigned tid) toCommit->squash[tid] = true; toCommit->squashedSeqNum[tid] = inst->seqNum; toCommit->nextPC[tid] = inst->readNextPC(); -#if ISA_HAS_DELAY_SLOT toCommit->nextNPC[tid] = inst->readNextNPC(); -#else - toCommit->nextNPC[tid] = inst->readNextPC() + sizeof(TheISA::MachInst); -#endif toCommit->branchMispredict[tid] = false; toCommit->includeSquashInst[tid] = false; @@ -532,11 +489,7 @@ DefaultIEW::squashDueToMemBlocked(DynInstPtr &inst, unsigned tid) toCommit->squash[tid] = true; toCommit->squashedSeqNum[tid] = inst->seqNum; toCommit->nextPC[tid] = inst->readPC(); -#if ISA_HAS_DELAY_SLOT toCommit->nextNPC[tid] = inst->readNextPC(); -#else - toCommit->nextNPC[tid] = inst->readPC() + sizeof(TheISA::MachInst); -#endif toCommit->branchMispredict[tid] = false; // Must include the broadcasted SN in the squash. @@ -880,10 +833,8 @@ DefaultIEW::sortInsts() { int insts_from_rename = fromRename->size; #ifdef DEBUG -#if !ISA_HAS_DELAY_SLOT for (int i = 0; i < numThreads; i++) assert(insts[i].empty()); -#endif #endif for (int i = 0; i < insts_from_rename; ++i) { insts[fromRename->insts[i]->threadNumber].push(fromRename->insts[i]); @@ -894,21 +845,9 @@ template void DefaultIEW::emptyRenameInsts(unsigned tid) { - DPRINTF(IEW, "[tid:%i]: Removing incoming rename instructions until " - "[sn:%i].\n", tid, bdelayDoneSeqNum[tid]); + DPRINTF(IEW, "[tid:%i]: Removing incoming rename instructions\n", tid); while (!insts[tid].empty()) { -#if ISA_HAS_DELAY_SLOT - if (insts[tid].front()->seqNum <= bdelayDoneSeqNum[tid]) { - DPRINTF(IEW, "[tid:%i]: Done removing, cannot remove instruction" - " that occurs at or before delay slot [sn:%i].\n", - tid, bdelayDoneSeqNum[tid]); - break; - } else { - DPRINTF(IEW, "[tid:%i]: Removing incoming rename instruction " - "[sn:%i].\n", tid, insts[tid].front()->seqNum); - } -#endif if (insts[tid].front()->isLoad() || insts[tid].front()->isStore() ) { diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index 10c3287f2..bdf5f07aa 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -1005,11 +1005,7 @@ InstructionQueue::squash(unsigned tid) // Read instruction sequence number of last instruction out of the // time buffer. -#if ISA_HAS_DELAY_SLOT - squashedSeqNum[tid] = fromCommit->commitInfo[tid].bdelayDoneSeqNum; -#else squashedSeqNum[tid] = fromCommit->commitInfo[tid].doneSeqNum; -#endif // Call doSquash if there are insts in the IQ if (count[tid] > 0) { diff --git a/src/cpu/o3/mips/dyn_inst.hh b/src/cpu/o3/mips/dyn_inst.hh index 366b4bb23..cf78c0941 100755 --- a/src/cpu/o3/mips/dyn_inst.hh +++ b/src/cpu/o3/mips/dyn_inst.hh @@ -69,10 +69,16 @@ class MipsDynInst : public BaseDynInst }; public: + /** BaseDynInst constructor given a binary instruction. */ + MipsDynInst(StaticInstPtr staticInst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu); + /** BaseDynInst constructor given a binary instruction. */ MipsDynInst(ExtMachInst inst, - Addr PC, Addr NPC, - Addr Pred_PC, Addr Pred_NPC, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu); /** BaseDynInst constructor given a static inst pointer. */ diff --git a/src/cpu/o3/mips/dyn_inst_impl.hh b/src/cpu/o3/mips/dyn_inst_impl.hh index c0f9ae771..7e8697b32 100755 --- a/src/cpu/o3/mips/dyn_inst_impl.hh +++ b/src/cpu/o3/mips/dyn_inst_impl.hh @@ -31,11 +31,23 @@ #include "cpu/o3/mips/dyn_inst.hh" template -MipsDynInst::MipsDynInst(ExtMachInst inst, - Addr PC, Addr NPC, - Addr Pred_PC, Addr Pred_NPC, +MipsDynInst::MipsDynInst(StaticInstPtr staticInst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu) - : BaseDynInst(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu) + : BaseDynInst(staticInst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) +{ + initVars(); +} + +template +MipsDynInst::MipsDynInst(ExtMachInst inst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu) + : BaseDynInst(inst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) { initVars(); } diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index 431705e19..d78de2c87 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -356,47 +356,12 @@ DefaultRename::squash(const InstSeqNum &squash_seq_num, unsigned tid) } // Clear the instruction list and skid buffer in case they have any - // insts in them. Since we support multiple ISAs, we cant just: - // "insts[tid].clear();" or "skidBuffer[tid].clear()" since there is - // a possible delay slot inst for different architectures - // insts[tid].clear(); -#if ISA_HAS_DELAY_SLOT - DPRINTF(Rename, "[tid:%i] Squashing incoming decode instructions until " - "[sn:%i].\n",tid, squash_seq_num); - ListIt ilist_it = insts[tid].begin(); - while (ilist_it != insts[tid].end()) { - if ((*ilist_it)->seqNum > squash_seq_num) { - (*ilist_it)->setSquashed(); - DPRINTF(Rename, "Squashing incoming decode instruction, " - "[tid:%i] [sn:%i] PC %08p.\n", tid, (*ilist_it)->seqNum, (*ilist_it)->PC); - } - ilist_it++; - } -#else + // insts in them. insts[tid].clear(); -#endif // Clear the skid buffer in case it has any data in it. - // See comments above. - // skidBuffer[tid].clear(); -#if ISA_HAS_DELAY_SLOT - DPRINTF(Rename, "[tid:%i] Squashing incoming skidbuffer instructions " - "until [sn:%i].\n", tid, squash_seq_num); - ListIt slist_it = skidBuffer[tid].begin(); - while (slist_it != skidBuffer[tid].end()) { - if ((*slist_it)->seqNum > squash_seq_num) { - (*slist_it)->setSquashed(); - DPRINTF(Rename, "Squashing skidbuffer instruction, [tid:%i] [sn:%i]" - "PC %08p.\n", tid, (*slist_it)->seqNum, (*slist_it)->PC); - } - slist_it++; - } - resumeUnblocking = (skidBuffer[tid].size() != 0); - DPRINTF(Rename, "Resume unblocking set to %s\n", - resumeUnblocking ? "true" : "false"); -#else skidBuffer[tid].clear(); -#endif + doSquash(squash_seq_num, tid); } @@ -776,10 +741,8 @@ DefaultRename::sortInsts() { int insts_from_decode = fromDecode->size; #ifdef DEBUG -#if !ISA_HAS_DELAY_SLOT for (int i=0; i < numThreads; i++) assert(insts[i].empty()); -#endif #endif for (int i = 0; i < insts_from_decode; ++i) { DynInstPtr inst = fromDecode->insts[i]; @@ -1000,6 +963,7 @@ DefaultRename::renameSrcRegs(DynInstPtr &inst,unsigned tid) // Floating point and Miscellaneous registers need their indexes // adjusted to account for the expanded number of flattened int regs. flat_src_reg = src_reg - TheISA::FP_Base_DepTag + TheISA::NumIntRegs; + DPRINTF(Rename, "Adjusting reg index from %d to %d.\n", src_reg, flat_src_reg); } inst->flattenSrcReg(src_idx, flat_src_reg); @@ -1016,9 +980,11 @@ DefaultRename::renameSrcRegs(DynInstPtr &inst,unsigned tid) // See if the register is ready or not. if (scoreboard->getReg(renamed_reg) == true) { - DPRINTF(Rename, "[tid:%u]: Register is ready.\n", tid); + DPRINTF(Rename, "[tid:%u]: Register %d is ready.\n", tid, renamed_reg); inst->markSrcRegReady(src_idx); + } else { + DPRINTF(Rename, "[tid:%u]: Register %d is not ready.\n", tid, renamed_reg); } ++renameRenameLookups; @@ -1045,6 +1011,7 @@ DefaultRename::renameDestRegs(DynInstPtr &inst,unsigned tid) // Floating point and Miscellaneous registers need their indexes // adjusted to account for the expanded number of flattened int regs. flat_dest_reg = dest_reg - TheISA::FP_Base_DepTag + TheISA::NumIntRegs; + DPRINTF(Rename, "Adjusting reg index from %d to %d.\n", dest_reg, flat_dest_reg); } inst->flattenDestReg(dest_idx, flat_dest_reg); @@ -1248,13 +1215,7 @@ DefaultRename::checkSignalsAndUpdate(unsigned tid) DPRINTF(Rename, "[tid:%u]: Squashing instructions due to squash from " "commit.\n", tid); -#if ISA_HAS_DELAY_SLOT - InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].bdelayDoneSeqNum; -#else - InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].doneSeqNum; -#endif - - squash(squashed_seq_num, tid); + squash(fromCommit->commitInfo[tid].doneSeqNum, tid); return true; } diff --git a/src/cpu/o3/sparc/dyn_inst.hh b/src/cpu/o3/sparc/dyn_inst.hh index 72242b161..a7ab6cd79 100644 --- a/src/cpu/o3/sparc/dyn_inst.hh +++ b/src/cpu/o3/sparc/dyn_inst.hh @@ -56,8 +56,14 @@ class SparcDynInst : public BaseDynInst public: /** BaseDynInst constructor given a binary instruction. */ - SparcDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, - Addr Pred_PC, Addr Pred_NPC, InstSeqNum seq_num, O3CPU *cpu); + SparcDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu); + + /** BaseDynInst constructor given a binary instruction. */ + SparcDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu); /** BaseDynInst constructor given a static inst pointer. */ SparcDynInst(StaticInstPtr &_staticInst); diff --git a/src/cpu/o3/sparc/dyn_inst_impl.hh b/src/cpu/o3/sparc/dyn_inst_impl.hh index c4d30b6f4..6bfe97717 100644 --- a/src/cpu/o3/sparc/dyn_inst_impl.hh +++ b/src/cpu/o3/sparc/dyn_inst_impl.hh @@ -31,10 +31,23 @@ #include "cpu/o3/sparc/dyn_inst.hh" template -SparcDynInst::SparcDynInst(TheISA::ExtMachInst inst, - Addr PC, Addr NPC, Addr Pred_PC, Addr Pred_NPC, +SparcDynInst::SparcDynInst(StaticInstPtr staticInst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu) - : BaseDynInst(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu) + : BaseDynInst(staticInst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) +{ + initVars(); +} + +template +SparcDynInst::SparcDynInst(TheISA::ExtMachInst inst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu) + : BaseDynInst(inst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) { initVars(); } diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt index 7c0d31494..4c5655a33 100644 --- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt +++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt @@ -1,17 +1,17 @@ ---------- Begin Simulation Statistics ---------- global.BPredUnit.BTBCorrect 0 # Number of correct BTB predictions (this stat may not work properly. -global.BPredUnit.BTBHits 2990 # Number of BTB hits -global.BPredUnit.BTBLookups 7055 # Number of BTB lookups +global.BPredUnit.BTBHits 3021 # Number of BTB hits +global.BPredUnit.BTBLookups 7086 # Number of BTB lookups global.BPredUnit.RASInCorrect 0 # Number of incorrect RAS predictions. global.BPredUnit.condIncorrect 2077 # Number of conditional branches incorrect -global.BPredUnit.condPredicted 7846 # Number of conditional branches predicted -global.BPredUnit.lookups 7846 # Number of BP lookups +global.BPredUnit.condPredicted 7877 # Number of conditional branches predicted +global.BPredUnit.lookups 7877 # Number of BP lookups global.BPredUnit.usedRAS 0 # Number of times the RAS was used to get a target. -host_inst_rate 15119 # Simulator instruction rate (inst/s) -host_mem_usage 154868 # Number of bytes of host memory used -host_seconds 0.73 # Real time elapsed on the host -host_tick_rate 1956796 # Simulator tick rate (ticks/s) +host_inst_rate 4388 # Simulator instruction rate (inst/s) +host_mem_usage 179936 # Number of bytes of host memory used +host_seconds 2.50 # Real time elapsed on the host +host_tick_rate 568121 # Simulator tick rate (ticks/s) memdepunit.memDep.conflictingLoads 12 # Number of conflicting loads. memdepunit.memDep.conflictingStores 0 # Number of conflicting stores. memdepunit.memDep.insertedLoads 3250 # Number of loads inserted to the mem dependence unit. @@ -19,22 +19,22 @@ memdepunit.memDep.insertedStores 2817 # Nu sim_freq 1000000000000 # Frequency of simulated ticks sim_insts 10976 # Number of instructions simulated sim_seconds 0.000001 # Number of seconds simulated -sim_ticks 1421211 # Number of ticks simulated +sim_ticks 1421207 # Number of ticks simulated system.cpu.commit.COM:branches 2152 # Number of branches committed -system.cpu.commit.COM:bw_lim_events 172 # number cycles where commit BW limit reached +system.cpu.commit.COM:bw_lim_events 225 # number cycles where commit BW limit reached system.cpu.commit.COM:bw_limited 0 # number of insts not committed due to BW limits system.cpu.commit.COM:committed_per_cycle.start_dist # Number of insts commited each cycle -system.cpu.commit.COM:committed_per_cycle.samples 221349 +system.cpu.commit.COM:committed_per_cycle.samples 220766 system.cpu.commit.COM:committed_per_cycle.min_value 0 - 0 215844 9751.30% - 1 2970 134.18% - 2 1290 58.28% - 3 631 28.51% - 4 208 9.40% - 5 90 4.07% - 6 133 6.01% + 0 215368 9755.49% + 1 2915 132.04% + 2 1196 54.18% + 3 673 30.48% + 4 208 9.42% + 5 79 3.58% + 6 91 4.12% 7 11 0.50% - 8 172 7.77% + 8 225 10.19% system.cpu.commit.COM:committed_per_cycle.max_value 8 system.cpu.commit.COM:committed_per_cycle.end_dist @@ -49,65 +49,65 @@ system.cpu.commit.commitNonSpecStalls 327 # Th system.cpu.commit.commitSquashedInsts 14263 # The number of squashed insts skipped by commit system.cpu.committedInsts 10976 # Number of Instructions Simulated system.cpu.committedInsts_total 10976 # Number of Instructions Simulated -system.cpu.cpi 129.483509 # CPI: Cycles Per Instruction -system.cpu.cpi_total 129.483509 # CPI: Total CPI of All Threads -system.cpu.dcache.ReadReq_accesses 2737 # number of ReadReq accesses(hits+misses) -system.cpu.dcache.ReadReq_avg_miss_latency 6585.044776 # average ReadReq miss latency -system.cpu.dcache.ReadReq_avg_mshr_miss_latency 6511.939394 # average ReadReq mshr miss latency -system.cpu.dcache.ReadReq_hits 2603 # number of ReadReq hits -system.cpu.dcache.ReadReq_miss_latency 882396 # number of ReadReq miss cycles -system.cpu.dcache.ReadReq_miss_rate 0.048959 # miss rate for ReadReq accesses +system.cpu.cpi 129.483145 # CPI: Cycles Per Instruction +system.cpu.cpi_total 129.483145 # CPI: Total CPI of All Threads +system.cpu.dcache.ReadReq_accesses 2738 # number of ReadReq accesses(hits+misses) +system.cpu.dcache.ReadReq_avg_miss_latency 6586.074627 # average ReadReq miss latency +system.cpu.dcache.ReadReq_avg_mshr_miss_latency 6513.166667 # average ReadReq mshr miss latency +system.cpu.dcache.ReadReq_hits 2604 # number of ReadReq hits +system.cpu.dcache.ReadReq_miss_latency 882534 # number of ReadReq miss cycles +system.cpu.dcache.ReadReq_miss_rate 0.048941 # miss rate for ReadReq accesses system.cpu.dcache.ReadReq_misses 134 # number of ReadReq misses system.cpu.dcache.ReadReq_mshr_hits 68 # number of ReadReq MSHR hits -system.cpu.dcache.ReadReq_mshr_miss_latency 429788 # number of ReadReq MSHR miss cycles -system.cpu.dcache.ReadReq_mshr_miss_rate 0.024114 # mshr miss rate for ReadReq accesses +system.cpu.dcache.ReadReq_mshr_miss_latency 429869 # number of ReadReq MSHR miss cycles +system.cpu.dcache.ReadReq_mshr_miss_rate 0.024105 # mshr miss rate for ReadReq accesses system.cpu.dcache.ReadReq_mshr_misses 66 # number of ReadReq MSHR misses system.cpu.dcache.SwapReq_accesses 6 # number of SwapReq accesses(hits+misses) system.cpu.dcache.SwapReq_hits 6 # number of SwapReq hits system.cpu.dcache.WriteReq_accesses 1292 # number of WriteReq accesses(hits+misses) -system.cpu.dcache.WriteReq_avg_miss_latency 7960.583924 # average WriteReq miss latency -system.cpu.dcache.WriteReq_avg_mshr_miss_latency 7136.918605 # average WriteReq mshr miss latency +system.cpu.dcache.WriteReq_avg_miss_latency 7962.583924 # average WriteReq miss latency +system.cpu.dcache.WriteReq_avg_mshr_miss_latency 7138.593023 # average WriteReq mshr miss latency system.cpu.dcache.WriteReq_hits 869 # number of WriteReq hits -system.cpu.dcache.WriteReq_miss_latency 3367327 # number of WriteReq miss cycles +system.cpu.dcache.WriteReq_miss_latency 3368173 # number of WriteReq miss cycles system.cpu.dcache.WriteReq_miss_rate 0.327399 # miss rate for WriteReq accesses system.cpu.dcache.WriteReq_misses 423 # number of WriteReq misses system.cpu.dcache.WriteReq_mshr_hits 337 # number of WriteReq MSHR hits -system.cpu.dcache.WriteReq_mshr_miss_latency 613775 # number of WriteReq MSHR miss cycles +system.cpu.dcache.WriteReq_mshr_miss_latency 613919 # number of WriteReq MSHR miss cycles system.cpu.dcache.WriteReq_mshr_miss_rate 0.066563 # mshr miss rate for WriteReq accesses system.cpu.dcache.WriteReq_mshr_misses 86 # number of WriteReq MSHR misses system.cpu.dcache.avg_blocked_cycles_no_mshrs # average number of cycles each access was blocked system.cpu.dcache.avg_blocked_cycles_no_targets # average number of cycles each access was blocked -system.cpu.dcache.avg_refs 22.881579 # Average number of references to valid blocks. +system.cpu.dcache.avg_refs 22.888158 # Average number of references to valid blocks. system.cpu.dcache.blocked_no_mshrs 0 # number of cycles access was blocked system.cpu.dcache.blocked_no_targets 0 # number of cycles access was blocked system.cpu.dcache.blocked_cycles_no_mshrs 0 # number of cycles access was blocked system.cpu.dcache.blocked_cycles_no_targets 0 # number of cycles access was blocked system.cpu.dcache.cache_copies 0 # number of cache copies performed -system.cpu.dcache.demand_accesses 4029 # number of demand (read+write) accesses -system.cpu.dcache.demand_avg_miss_latency 7629.664273 # average overall miss latency -system.cpu.dcache.demand_avg_mshr_miss_latency 6865.546053 # average overall mshr miss latency -system.cpu.dcache.demand_hits 3472 # number of demand (read+write) hits -system.cpu.dcache.demand_miss_latency 4249723 # number of demand (read+write) miss cycles -system.cpu.dcache.demand_miss_rate 0.138248 # miss rate for demand accesses +system.cpu.dcache.demand_accesses 4030 # number of demand (read+write) accesses +system.cpu.dcache.demand_avg_miss_latency 7631.430880 # average overall miss latency +system.cpu.dcache.demand_avg_mshr_miss_latency 6867.026316 # average overall mshr miss latency +system.cpu.dcache.demand_hits 3473 # number of demand (read+write) hits +system.cpu.dcache.demand_miss_latency 4250707 # number of demand (read+write) miss cycles +system.cpu.dcache.demand_miss_rate 0.138213 # miss rate for demand accesses system.cpu.dcache.demand_misses 557 # number of demand (read+write) misses system.cpu.dcache.demand_mshr_hits 405 # number of demand (read+write) MSHR hits -system.cpu.dcache.demand_mshr_miss_latency 1043563 # number of demand (read+write) MSHR miss cycles -system.cpu.dcache.demand_mshr_miss_rate 0.037726 # mshr miss rate for demand accesses +system.cpu.dcache.demand_mshr_miss_latency 1043788 # number of demand (read+write) MSHR miss cycles +system.cpu.dcache.demand_mshr_miss_rate 0.037717 # mshr miss rate for demand accesses system.cpu.dcache.demand_mshr_misses 152 # number of demand (read+write) MSHR misses system.cpu.dcache.fast_writes 0 # number of fast writes performed system.cpu.dcache.mshr_cap_events 0 # number of times MSHR cap was activated system.cpu.dcache.no_allocate_misses 0 # Number of misses that were no-allocate -system.cpu.dcache.overall_accesses 4029 # number of overall (read+write) accesses -system.cpu.dcache.overall_avg_miss_latency 7629.664273 # average overall miss latency -system.cpu.dcache.overall_avg_mshr_miss_latency 6865.546053 # average overall mshr miss latency +system.cpu.dcache.overall_accesses 4030 # number of overall (read+write) accesses +system.cpu.dcache.overall_avg_miss_latency 7631.430880 # average overall miss latency +system.cpu.dcache.overall_avg_mshr_miss_latency 6867.026316 # average overall mshr miss latency system.cpu.dcache.overall_avg_mshr_uncacheable_latency # average overall mshr uncacheable latency -system.cpu.dcache.overall_hits 3472 # number of overall hits -system.cpu.dcache.overall_miss_latency 4249723 # number of overall miss cycles -system.cpu.dcache.overall_miss_rate 0.138248 # miss rate for overall accesses +system.cpu.dcache.overall_hits 3473 # number of overall hits +system.cpu.dcache.overall_miss_latency 4250707 # number of overall miss cycles +system.cpu.dcache.overall_miss_rate 0.138213 # miss rate for overall accesses system.cpu.dcache.overall_misses 557 # number of overall misses system.cpu.dcache.overall_mshr_hits 405 # number of overall MSHR hits -system.cpu.dcache.overall_mshr_miss_latency 1043563 # number of overall MSHR miss cycles -system.cpu.dcache.overall_mshr_miss_rate 0.037726 # mshr miss rate for overall accesses +system.cpu.dcache.overall_mshr_miss_latency 1043788 # number of overall MSHR miss cycles +system.cpu.dcache.overall_mshr_miss_rate 0.037717 # mshr miss rate for overall accesses system.cpu.dcache.overall_mshr_misses 152 # number of overall MSHR misses system.cpu.dcache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles system.cpu.dcache.overall_mshr_uncacheable_misses 0 # number of overall MSHR uncacheable misses @@ -123,50 +123,50 @@ system.cpu.dcache.prefetcher.num_hwpf_squashed_from_miss 0 system.cpu.dcache.replacements 0 # number of replacements system.cpu.dcache.sampled_refs 152 # Sample count of references to valid blocks. system.cpu.dcache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions -system.cpu.dcache.tagsinuse 90.938737 # Cycle average of tags in use -system.cpu.dcache.total_refs 3478 # Total number of references to valid blocks. +system.cpu.dcache.tagsinuse 90.938565 # Cycle average of tags in use +system.cpu.dcache.total_refs 3479 # Total number of references to valid blocks. system.cpu.dcache.warmup_cycle 0 # Cycle when the warmup percentage was hit. system.cpu.dcache.writebacks 0 # number of writebacks -system.cpu.decode.DECODE:BlockedCycles 192719 # Number of cycles decode is blocked -system.cpu.decode.DECODE:DecodedInsts 39774 # Number of instructions handled by decode -system.cpu.decode.DECODE:IdleCycles 20128 # Number of cycles decode is idle -system.cpu.decode.DECODE:RunCycles 8238 # Number of cycles decode is running +system.cpu.decode.DECODE:BlockedCycles 192302 # Number of cycles decode is blocked +system.cpu.decode.DECODE:DecodedInsts 39763 # Number of instructions handled by decode +system.cpu.decode.DECODE:IdleCycles 19973 # Number of cycles decode is idle +system.cpu.decode.DECODE:RunCycles 8441 # Number of cycles decode is running system.cpu.decode.DECODE:SquashCycles 3162 # Number of cycles decode is squashing -system.cpu.decode.DECODE:UnblockCycles 264 # Number of cycles decode is unblocking -system.cpu.fetch.Branches 7846 # Number of branches that fetch encountered +system.cpu.decode.DECODE:UnblockCycles 50 # Number of cycles decode is unblocking +system.cpu.fetch.Branches 7877 # Number of branches that fetch encountered system.cpu.fetch.CacheLines 5085 # Number of cache lines fetched -system.cpu.fetch.Cycles 14399 # Number of cycles fetch has run and was not squashing or blocked +system.cpu.fetch.Cycles 14430 # Number of cycles fetch has run and was not squashing or blocked system.cpu.fetch.IcacheSquashes 745 # Number of outstanding Icache misses that were squashed -system.cpu.fetch.Insts 43304 # Number of instructions fetch has processed +system.cpu.fetch.Insts 43366 # Number of instructions fetch has processed system.cpu.fetch.SquashCycles 2134 # Number of cycles fetch has spent squashing -system.cpu.fetch.branchRate 0.034947 # Number of branch fetches per cycle +system.cpu.fetch.branchRate 0.035176 # Number of branch fetches per cycle system.cpu.fetch.icacheStallCycles 5085 # Number of cycles fetch is stalled on an Icache miss -system.cpu.fetch.predictedBranches 2990 # Number of branches that fetch has predicted taken -system.cpu.fetch.rate 0.192881 # Number of inst fetches per cycle +system.cpu.fetch.predictedBranches 3021 # Number of branches that fetch has predicted taken +system.cpu.fetch.rate 0.193660 # Number of inst fetches per cycle system.cpu.fetch.rateDist.start_dist # Number of instructions fetched each cycle (Total) -system.cpu.fetch.rateDist.samples 224511 +system.cpu.fetch.rateDist.samples 223928 system.cpu.fetch.rateDist.min_value 0 - 0 215198 9585.19% - 1 2258 100.57% - 2 627 27.93% - 3 958 42.67% - 4 553 24.63% - 5 816 36.35% - 6 951 42.36% - 7 280 12.47% - 8 2870 127.83% + 0 214584 9582.72% + 1 2258 100.84% + 2 658 29.38% + 3 958 42.78% + 4 553 24.70% + 5 816 36.44% + 6 951 42.47% + 7 280 12.50% + 8 2870 128.17% system.cpu.fetch.rateDist.max_value 8 system.cpu.fetch.rateDist.end_dist system.cpu.icache.ReadReq_accesses 5085 # number of ReadReq accesses(hits+misses) -system.cpu.icache.ReadReq_avg_miss_latency 5148.266776 # average ReadReq miss latency -system.cpu.icache.ReadReq_avg_mshr_miss_latency 4502.972752 # average ReadReq mshr miss latency +system.cpu.icache.ReadReq_avg_miss_latency 5150.152209 # average ReadReq miss latency +system.cpu.icache.ReadReq_avg_mshr_miss_latency 4503.673025 # average ReadReq mshr miss latency system.cpu.icache.ReadReq_hits 4474 # number of ReadReq hits -system.cpu.icache.ReadReq_miss_latency 3145591 # number of ReadReq miss cycles +system.cpu.icache.ReadReq_miss_latency 3146743 # number of ReadReq miss cycles system.cpu.icache.ReadReq_miss_rate 0.120157 # miss rate for ReadReq accesses system.cpu.icache.ReadReq_misses 611 # number of ReadReq misses system.cpu.icache.ReadReq_mshr_hits 244 # number of ReadReq MSHR hits -system.cpu.icache.ReadReq_mshr_miss_latency 1652591 # number of ReadReq MSHR miss cycles +system.cpu.icache.ReadReq_mshr_miss_latency 1652848 # number of ReadReq MSHR miss cycles system.cpu.icache.ReadReq_mshr_miss_rate 0.072173 # mshr miss rate for ReadReq accesses system.cpu.icache.ReadReq_mshr_misses 367 # number of ReadReq MSHR misses system.cpu.icache.avg_blocked_cycles_no_mshrs # average number of cycles each access was blocked @@ -178,29 +178,29 @@ system.cpu.icache.blocked_cycles_no_mshrs 0 # n system.cpu.icache.blocked_cycles_no_targets 0 # number of cycles access was blocked system.cpu.icache.cache_copies 0 # number of cache copies performed system.cpu.icache.demand_accesses 5085 # number of demand (read+write) accesses -system.cpu.icache.demand_avg_miss_latency 5148.266776 # average overall miss latency -system.cpu.icache.demand_avg_mshr_miss_latency 4502.972752 # average overall mshr miss latency +system.cpu.icache.demand_avg_miss_latency 5150.152209 # average overall miss latency +system.cpu.icache.demand_avg_mshr_miss_latency 4503.673025 # average overall mshr miss latency system.cpu.icache.demand_hits 4474 # number of demand (read+write) hits -system.cpu.icache.demand_miss_latency 3145591 # number of demand (read+write) miss cycles +system.cpu.icache.demand_miss_latency 3146743 # number of demand (read+write) miss cycles system.cpu.icache.demand_miss_rate 0.120157 # miss rate for demand accesses system.cpu.icache.demand_misses 611 # number of demand (read+write) misses system.cpu.icache.demand_mshr_hits 244 # number of demand (read+write) MSHR hits -system.cpu.icache.demand_mshr_miss_latency 1652591 # number of demand (read+write) MSHR miss cycles +system.cpu.icache.demand_mshr_miss_latency 1652848 # number of demand (read+write) MSHR miss cycles system.cpu.icache.demand_mshr_miss_rate 0.072173 # mshr miss rate for demand accesses system.cpu.icache.demand_mshr_misses 367 # number of demand (read+write) MSHR misses system.cpu.icache.fast_writes 0 # number of fast writes performed system.cpu.icache.mshr_cap_events 0 # number of times MSHR cap was activated system.cpu.icache.no_allocate_misses 0 # Number of misses that were no-allocate system.cpu.icache.overall_accesses 5085 # number of overall (read+write) accesses -system.cpu.icache.overall_avg_miss_latency 5148.266776 # average overall miss latency -system.cpu.icache.overall_avg_mshr_miss_latency 4502.972752 # average overall mshr miss latency +system.cpu.icache.overall_avg_miss_latency 5150.152209 # average overall miss latency +system.cpu.icache.overall_avg_mshr_miss_latency 4503.673025 # average overall mshr miss latency system.cpu.icache.overall_avg_mshr_uncacheable_latency # average overall mshr uncacheable latency system.cpu.icache.overall_hits 4474 # number of overall hits -system.cpu.icache.overall_miss_latency 3145591 # number of overall miss cycles +system.cpu.icache.overall_miss_latency 3146743 # number of overall miss cycles system.cpu.icache.overall_miss_rate 0.120157 # miss rate for overall accesses system.cpu.icache.overall_misses 611 # number of overall misses system.cpu.icache.overall_mshr_hits 244 # number of overall MSHR hits -system.cpu.icache.overall_mshr_miss_latency 1652591 # number of overall MSHR miss cycles +system.cpu.icache.overall_mshr_miss_latency 1652848 # number of overall MSHR miss cycles system.cpu.icache.overall_mshr_miss_rate 0.072173 # mshr miss rate for overall accesses system.cpu.icache.overall_mshr_misses 367 # number of overall MSHR misses system.cpu.icache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles @@ -217,35 +217,35 @@ system.cpu.icache.prefetcher.num_hwpf_squashed_from_miss 0 system.cpu.icache.replacements 1 # number of replacements system.cpu.icache.sampled_refs 363 # Sample count of references to valid blocks. system.cpu.icache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions -system.cpu.icache.tagsinuse 172.869174 # Cycle average of tags in use +system.cpu.icache.tagsinuse 172.868641 # Cycle average of tags in use system.cpu.icache.total_refs 4474 # Total number of references to valid blocks. system.cpu.icache.warmup_cycle 0 # Cycle when the warmup percentage was hit. system.cpu.icache.writebacks 0 # number of writebacks -system.cpu.idleCycles 1196701 # Total number of cycles that the CPU has spent unscheduled due to idling -system.cpu.iew.EXEC:branches 3576 # Number of branches executed +system.cpu.idleCycles 1197280 # Total number of cycles that the CPU has spent unscheduled due to idling +system.cpu.iew.EXEC:branches 3577 # Number of branches executed system.cpu.iew.EXEC:nop 0 # number of nop insts executed -system.cpu.iew.EXEC:rate 0.092548 # Inst execution rate -system.cpu.iew.EXEC:refs 5257 # number of memory reference insts executed +system.cpu.iew.EXEC:rate 0.092802 # Inst execution rate +system.cpu.iew.EXEC:refs 5258 # number of memory reference insts executed system.cpu.iew.EXEC:stores 2386 # Number of stores executed system.cpu.iew.EXEC:swp 0 # number of swp insts executed system.cpu.iew.WB:consumers 9737 # num instructions consuming a value -system.cpu.iew.WB:count 19769 # cumulative count of insts written-back +system.cpu.iew.WB:count 19771 # cumulative count of insts written-back system.cpu.iew.WB:fanout 0.790901 # average fanout of values written-back system.cpu.iew.WB:penalized 0 # number of instrctions required to write to 'other' IQ system.cpu.iew.WB:penalized_rate 0 # fraction of instructions written-back that wrote to 'other' IQ system.cpu.iew.WB:producers 7701 # num instructions producing a value -system.cpu.iew.WB:rate 0.088054 # insts written-back per cycle -system.cpu.iew.WB:sent 20061 # cumulative count of insts sent to commit -system.cpu.iew.branchMispredicts 2593 # Number of branch mispredicts detected at execute +system.cpu.iew.WB:rate 0.088292 # insts written-back per cycle +system.cpu.iew.WB:sent 20063 # cumulative count of insts sent to commit +system.cpu.iew.branchMispredicts 2594 # Number of branch mispredicts detected at execute system.cpu.iew.iewBlockCycles 476 # Number of cycles IEW is blocking system.cpu.iew.iewDispLoadInsts 3250 # Number of dispatched load instructions system.cpu.iew.iewDispNonSpecInsts 617 # Number of dispatched non-speculative instructions -system.cpu.iew.iewDispSquashedInsts 2705 # Number of squashed instructions skipped by dispatch +system.cpu.iew.iewDispSquashedInsts 2694 # Number of squashed instructions skipped by dispatch system.cpu.iew.iewDispStoreInsts 2817 # Number of dispatched store instructions system.cpu.iew.iewDispatchedInsts 25240 # Number of instructions dispatched to IQ -system.cpu.iew.iewExecLoadInsts 2871 # Number of load instructions executed -system.cpu.iew.iewExecSquashedInsts 1780 # Number of squashed instructions skipped in execute -system.cpu.iew.iewExecutedInsts 20778 # Number of executed instructions +system.cpu.iew.iewExecLoadInsts 2872 # Number of load instructions executed +system.cpu.iew.iewExecSquashedInsts 1777 # Number of squashed instructions skipped in execute +system.cpu.iew.iewExecutedInsts 20781 # Number of executed instructions system.cpu.iew.iewIQFullEvents 7 # Number of times the IQ has become full, causing a stall system.cpu.iew.iewIdleCycles 0 # Number of cycles IEW is idle system.cpu.iew.iewLSQFullEvents 0 # Number of times the LSQ has become full, causing a stall @@ -262,7 +262,7 @@ system.cpu.iew.lsq.thread.0.rescheduledLoads 0 system.cpu.iew.lsq.thread.0.squashedLoads 1788 # Number of loads squashed system.cpu.iew.lsq.thread.0.squashedStores 1519 # Number of stores squashed system.cpu.iew.memOrderViolationEvents 54 # Number of memory order violations -system.cpu.iew.predictedNotTakenIncorrect 962 # Number of branches that were predicted not taken incorrectly +system.cpu.iew.predictedNotTakenIncorrect 963 # Number of branches that were predicted not taken incorrectly system.cpu.iew.predictedTakenIncorrect 1631 # Number of branches that were predicted taken incorrectly system.cpu.ipc 0.007723 # IPC: Instructions Per Cycle system.cpu.ipc_total 0.007723 # IPC: Total IPC of All Threads @@ -302,21 +302,21 @@ system.cpu.iq.ISSUE:fu_full.start_dist InstPrefetch 0 0.00% # attempts to use FU when none available system.cpu.iq.ISSUE:fu_full.end_dist system.cpu.iq.ISSUE:issued_per_cycle.start_dist # Number of insts issued each cycle -system.cpu.iq.ISSUE:issued_per_cycle.samples 224511 +system.cpu.iq.ISSUE:issued_per_cycle.samples 223928 system.cpu.iq.ISSUE:issued_per_cycle.min_value 0 - 0 215315 9590.40% - 1 4124 183.69% - 2 1297 57.77% - 3 1306 58.17% - 4 1190 53.00% - 5 707 31.49% - 6 433 19.29% - 7 83 3.70% - 8 56 2.49% + 0 214838 9594.07% + 1 3976 177.56% + 2 1244 55.55% + 3 1359 60.69% + 4 1316 58.77% + 5 612 27.33% + 6 444 19.83% + 7 83 3.71% + 8 56 2.50% system.cpu.iq.ISSUE:issued_per_cycle.max_value 8 system.cpu.iq.ISSUE:issued_per_cycle.end_dist -system.cpu.iq.ISSUE:rate 0.100476 # Inst issue rate +system.cpu.iq.ISSUE:rate 0.100738 # Inst issue rate system.cpu.iq.iqInstsAdded 24623 # Number of instructions added to the IQ (excludes non-spec) system.cpu.iq.iqInstsIssued 22558 # Number of instructions issued system.cpu.iq.iqNonSpecInstsAdded 617 # Number of non-speculative instructions added to the IQ @@ -325,12 +325,12 @@ system.cpu.iq.iqSquashedInstsIssued 174 # Nu system.cpu.iq.iqSquashedNonSpecRemoved 290 # Number of squashed non-spec instructions that were removed system.cpu.iq.iqSquashedOperandsExamined 5834 # Number of squashed operands that are examined and possibly removed from graph system.cpu.l2cache.ReadReq_accesses 513 # number of ReadReq accesses(hits+misses) -system.cpu.l2cache.ReadReq_avg_miss_latency 4754.779727 # average ReadReq miss latency -system.cpu.l2cache.ReadReq_avg_mshr_miss_latency 2343.506823 # average ReadReq mshr miss latency -system.cpu.l2cache.ReadReq_miss_latency 2439202 # number of ReadReq miss cycles +system.cpu.l2cache.ReadReq_avg_miss_latency 4755.715400 # average ReadReq miss latency +system.cpu.l2cache.ReadReq_avg_mshr_miss_latency 2343.752437 # average ReadReq mshr miss latency +system.cpu.l2cache.ReadReq_miss_latency 2439682 # number of ReadReq miss cycles system.cpu.l2cache.ReadReq_miss_rate 1 # miss rate for ReadReq accesses system.cpu.l2cache.ReadReq_misses 513 # number of ReadReq misses -system.cpu.l2cache.ReadReq_mshr_miss_latency 1202219 # number of ReadReq MSHR miss cycles +system.cpu.l2cache.ReadReq_mshr_miss_latency 1202345 # number of ReadReq MSHR miss cycles system.cpu.l2cache.ReadReq_mshr_miss_rate 1 # mshr miss rate for ReadReq accesses system.cpu.l2cache.ReadReq_mshr_misses 513 # number of ReadReq MSHR misses system.cpu.l2cache.avg_blocked_cycles_no_mshrs # average number of cycles each access was blocked @@ -342,29 +342,29 @@ system.cpu.l2cache.blocked_cycles_no_mshrs 0 # system.cpu.l2cache.blocked_cycles_no_targets 0 # number of cycles access was blocked system.cpu.l2cache.cache_copies 0 # number of cache copies performed system.cpu.l2cache.demand_accesses 513 # number of demand (read+write) accesses -system.cpu.l2cache.demand_avg_miss_latency 4754.779727 # average overall miss latency -system.cpu.l2cache.demand_avg_mshr_miss_latency 2343.506823 # average overall mshr miss latency +system.cpu.l2cache.demand_avg_miss_latency 4755.715400 # average overall miss latency +system.cpu.l2cache.demand_avg_mshr_miss_latency 2343.752437 # average overall mshr miss latency system.cpu.l2cache.demand_hits 0 # number of demand (read+write) hits -system.cpu.l2cache.demand_miss_latency 2439202 # number of demand (read+write) miss cycles +system.cpu.l2cache.demand_miss_latency 2439682 # number of demand (read+write) miss cycles system.cpu.l2cache.demand_miss_rate 1 # miss rate for demand accesses system.cpu.l2cache.demand_misses 513 # number of demand (read+write) misses system.cpu.l2cache.demand_mshr_hits 0 # number of demand (read+write) MSHR hits -system.cpu.l2cache.demand_mshr_miss_latency 1202219 # number of demand (read+write) MSHR miss cycles +system.cpu.l2cache.demand_mshr_miss_latency 1202345 # number of demand (read+write) MSHR miss cycles system.cpu.l2cache.demand_mshr_miss_rate 1 # mshr miss rate for demand accesses system.cpu.l2cache.demand_mshr_misses 513 # number of demand (read+write) MSHR misses system.cpu.l2cache.fast_writes 0 # number of fast writes performed system.cpu.l2cache.mshr_cap_events 0 # number of times MSHR cap was activated system.cpu.l2cache.no_allocate_misses 0 # Number of misses that were no-allocate system.cpu.l2cache.overall_accesses 513 # number of overall (read+write) accesses -system.cpu.l2cache.overall_avg_miss_latency 4754.779727 # average overall miss latency -system.cpu.l2cache.overall_avg_mshr_miss_latency 2343.506823 # average overall mshr miss latency +system.cpu.l2cache.overall_avg_miss_latency 4755.715400 # average overall miss latency +system.cpu.l2cache.overall_avg_mshr_miss_latency 2343.752437 # average overall mshr miss latency system.cpu.l2cache.overall_avg_mshr_uncacheable_latency # average overall mshr uncacheable latency system.cpu.l2cache.overall_hits 0 # number of overall hits -system.cpu.l2cache.overall_miss_latency 2439202 # number of overall miss cycles +system.cpu.l2cache.overall_miss_latency 2439682 # number of overall miss cycles system.cpu.l2cache.overall_miss_rate 1 # miss rate for overall accesses system.cpu.l2cache.overall_misses 513 # number of overall misses system.cpu.l2cache.overall_mshr_hits 0 # number of overall MSHR hits -system.cpu.l2cache.overall_mshr_miss_latency 1202219 # number of overall MSHR miss cycles +system.cpu.l2cache.overall_mshr_miss_latency 1202345 # number of overall MSHR miss cycles system.cpu.l2cache.overall_mshr_miss_rate 1 # mshr miss rate for overall accesses system.cpu.l2cache.overall_mshr_misses 513 # number of overall MSHR misses system.cpu.l2cache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles @@ -381,28 +381,27 @@ system.cpu.l2cache.prefetcher.num_hwpf_squashed_from_miss 0 system.cpu.l2cache.replacements 0 # number of replacements system.cpu.l2cache.sampled_refs 512 # Sample count of references to valid blocks. system.cpu.l2cache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions -system.cpu.l2cache.tagsinuse 262.946375 # Cycle average of tags in use +system.cpu.l2cache.tagsinuse 262.945674 # Cycle average of tags in use system.cpu.l2cache.total_refs 0 # Total number of references to valid blocks. system.cpu.l2cache.warmup_cycle 0 # Cycle when the warmup percentage was hit. system.cpu.l2cache.writebacks 0 # number of writebacks -system.cpu.numCycles 224511 # number of cpu cycles simulated +system.cpu.numCycles 223928 # number of cpu cycles simulated system.cpu.rename.RENAME:BlockCycles 960 # Number of cycles rename is blocking system.cpu.rename.RENAME:CommittedMaps 9868 # Number of HB maps that are committed system.cpu.rename.RENAME:IQFullEvents 2 # Number of times rename has blocked due to IQ full -system.cpu.rename.RENAME:IdleCycles 20098 # Number of cycles rename is idle -system.cpu.rename.RENAME:LSQFullEvents 481 # Number of times rename has blocked due to LSQ full +system.cpu.rename.RENAME:IdleCycles 21302 # Number of cycles rename is idle +system.cpu.rename.RENAME:LSQFullEvents 411 # Number of times rename has blocked due to LSQ full system.cpu.rename.RENAME:ROBFullEvents 4 # Number of times rename has blocked due to ROB full system.cpu.rename.RENAME:RenameLookups 46931 # Number of register rename lookups that rename has made -system.cpu.rename.RENAME:RenamedInsts 31260 # Number of instructions processed by rename +system.cpu.rename.RENAME:RenamedInsts 31249 # Number of instructions processed by rename system.cpu.rename.RENAME:RenamedOperands 25831 # Number of destination operands rename has renamed -system.cpu.rename.RENAME:RunCycles 7921 # Number of cycles rename is running +system.cpu.rename.RENAME:RunCycles 7136 # Number of cycles rename is running system.cpu.rename.RENAME:SquashCycles 3162 # Number of cycles rename is squashing -system.cpu.rename.RENAME:SquashedInsts 8042 # Number of squashed instructions processed by rename -system.cpu.rename.RENAME:UnblockCycles 1212 # Number of cycles rename is unblocking +system.cpu.rename.RENAME:UnblockCycles 614 # Number of cycles rename is unblocking system.cpu.rename.RENAME:UndoneMaps 15963 # Number of HB maps that are undone due to squashing -system.cpu.rename.RENAME:serializeStallCycles 190573 # count of cycles rename stalled for serializing inst +system.cpu.rename.RENAME:serializeStallCycles 190754 # count of cycles rename stalled for serializing inst system.cpu.rename.RENAME:serializingInsts 638 # count of serializing insts renamed -system.cpu.rename.RENAME:skidInsts 5594 # count of insts added to the skid buffer +system.cpu.rename.RENAME:skidInsts 5529 # count of insts added to the skid buffer system.cpu.rename.RENAME:tempSerializingInsts 629 # count of temporary serializing insts renamed system.cpu.timesIdled 289 # Number of times that the entire CPU went into an idle state and unscheduled itself system.cpu.workload.PROG:num_syscalls 8 # Number of system calls diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr index 48affb0e2..7873672f2 100644 --- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr +++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr @@ -1,4 +1,3 @@ warn: More than two loadable segments in ELF object. warn: Ignoring segment @ 0x0 length 0x0. -0: system.remote_gdb.listener: listening for remote gdb on port 7003 warn: Entering event queue @ 0. Starting simulation... diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout index 6cba2ba7e..38b0c1787 100644 --- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout +++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout @@ -16,9 +16,9 @@ The Regents of The University of Michigan All Rights Reserved -M5 compiled Apr 9 2007 03:06:26 -M5 started Mon Apr 9 03:06:54 2007 -M5 executing on zizzer.eecs.umich.edu +M5 compiled Apr 13 2007 13:56:34 +M5 started Fri Apr 13 13:56:35 2007 +M5 executing on ahchoo.blinky.homelinux.org command line: build/SPARC_SE/m5.fast -d build/SPARC_SE/tests/fast/quick/02.insttest/sparc/linux/o3-timing tests/run.py quick/02.insttest/sparc/linux/o3-timing Global frequency set at 1000000000000 ticks per second -Exiting @ tick 1421211 because target called exit() +Exiting @ tick 1421207 because target called exit()