From c7f1cf1d58cf50118c18b1afc4c938eafba81492 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Fri, 13 Apr 2007 13:59:31 +0000 Subject: [PATCH 1/7] Remove most of the special handling for delay slots since they have to be squashed anyway on a mispredict. This is because the NNPC value they saw when executing was incorrect. --HG-- extra : convert_revision : b42c4eb28b4fbba66c65cbd0a5033bf886c1532d --- src/cpu/base_dyn_inst.hh | 2 + src/cpu/o3/comm.hh | 5 --- src/cpu/o3/commit_impl.hh | 75 ----------------------------------- src/cpu/o3/cpu.cc | 12 +----- src/cpu/o3/cpu.hh | 3 +- src/cpu/o3/decode_impl.hh | 65 +----------------------------- src/cpu/o3/fetch.hh | 3 +- src/cpu/o3/fetch_impl.hh | 33 ++------------- src/cpu/o3/iew.hh | 3 -- src/cpu/o3/iew_impl.hh | 74 +++------------------------------- src/cpu/o3/inst_queue_impl.hh | 4 -- src/cpu/o3/rename_impl.hh | 49 ++--------------------- 12 files changed, 20 insertions(+), 308 deletions(-) diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index eed05c2f1..b02038b3e 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -709,7 +709,9 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Set the next NPC of this instruction (the target in Mips or Sparc).*/ void setNextNPC(uint64_t val) { +#if ISA_HAS_DELAY_SLOT nextNPC = val; +#endif } /** Sets the ASID. */ diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index d96919007..8d7bb95f4 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -87,7 +87,6 @@ struct DefaultIEWDefaultCommit { bool squash[Impl::MaxThreads]; bool branchMispredict[Impl::MaxThreads]; bool branchTaken[Impl::MaxThreads]; - bool squashDelaySlot[Impl::MaxThreads]; uint64_t mispredPC[Impl::MaxThreads]; uint64_t nextPC[Impl::MaxThreads]; uint64_t nextNPC[Impl::MaxThreads]; @@ -114,7 +113,6 @@ struct TimeBufStruct { uint64_t branchAddr; InstSeqNum doneSeqNum; - InstSeqNum bdelayDoneSeqNum; // @todo: Might want to package this kind of branch stuff into a single // struct as it is used pretty frequently. @@ -169,9 +167,6 @@ struct TimeBufStruct { // retired or squashed sequence number. InstSeqNum doneSeqNum; - InstSeqNum bdelayDoneSeqNum; - bool squashDelaySlot; - //Just in case we want to do a commit/squash on a cycle //(necessary for multiple ROBs?) bool commitInsts; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 65625065d..9dd5ed291 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -741,38 +741,15 @@ DefaultCommit::commit() // then use one older sequence number. InstSeqNum squashed_inst = fromIEW->squashedSeqNum[tid]; -#if ISA_HAS_DELAY_SLOT - InstSeqNum bdelay_done_seq_num = squashed_inst; - bool squash_bdelay_slot = fromIEW->squashDelaySlot[tid]; - bool branchMispredict = fromIEW->branchMispredict[tid]; - - // Squashing/not squashing the branch delay slot only makes - // sense when you're squashing from a branch, ie from a branch - // mispredict. - if (branchMispredict && !squash_bdelay_slot) { - bdelay_done_seq_num++; - } -#endif - if (fromIEW->includeSquashInst[tid] == true) { squashed_inst--; -#if ISA_HAS_DELAY_SLOT - bdelay_done_seq_num--; -#endif } // All younger instructions will be squashed. Set the sequence // number as the youngest instruction in the ROB. youngestSeqNum[tid] = squashed_inst; -#if ISA_HAS_DELAY_SLOT - rob->squash(bdelay_done_seq_num, tid); - toIEW->commitInfo[tid].squashDelaySlot = squash_bdelay_slot; - toIEW->commitInfo[tid].bdelayDoneSeqNum = bdelay_done_seq_num; -#else rob->squash(squashed_inst, tid); - toIEW->commitInfo[tid].squashDelaySlot = true; -#endif changedROBNumEntries[tid] = true; toIEW->commitInfo[tid].doneSeqNum = squashed_inst; @@ -809,10 +786,6 @@ DefaultCommit::commit() // Try to commit any instructions. commitInsts(); - } else { -#if ISA_HAS_DELAY_SLOT - skidInsert(); -#endif } //Check for any activity @@ -1164,37 +1137,13 @@ DefaultCommit::getInsts() { DPRINTF(Commit, "Getting instructions from Rename stage.\n"); -#if ISA_HAS_DELAY_SLOT - // Read any renamed instructions and place them into the ROB. - int insts_to_process = std::min((int)renameWidth, - (int)(fromRename->size + skidBuffer.size())); - int rename_idx = 0; - - DPRINTF(Commit, "%i insts available to process. Rename Insts:%i " - "SkidBuffer Insts:%i\n", insts_to_process, fromRename->size, - skidBuffer.size()); -#else // Read any renamed instructions and place them into the ROB. int insts_to_process = std::min((int)renameWidth, fromRename->size); -#endif - for (int inst_num = 0; inst_num < insts_to_process; ++inst_num) { DynInstPtr inst; -#if ISA_HAS_DELAY_SLOT - // Get insts from skidBuffer or from Rename - if (skidBuffer.size() > 0) { - DPRINTF(Commit, "Grabbing skidbuffer inst.\n"); - inst = skidBuffer.front(); - skidBuffer.pop(); - } else { - DPRINTF(Commit, "Grabbing rename inst.\n"); - inst = fromRename->insts[rename_idx++]; - } -#else inst = fromRename->insts[inst_num]; -#endif int tid = inst->threadNumber; if (!inst->isSquashed() && @@ -1216,30 +1165,6 @@ DefaultCommit::getInsts() inst->readPC(), inst->seqNum, tid); } } - -#if ISA_HAS_DELAY_SLOT - if (rename_idx < fromRename->size) { - DPRINTF(Commit,"Placing Rename Insts into skidBuffer.\n"); - - for (; - rename_idx < fromRename->size; - rename_idx++) { - DynInstPtr inst = fromRename->insts[rename_idx]; - - if (!inst->isSquashed()) { - DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ", - "skidBuffer.\n", inst->readPC(), inst->seqNum, - inst->threadNumber); - skidBuffer.push(inst); - } else { - DPRINTF(Commit, "Instruction PC %#x [sn:%i] [tid:%i] was " - "squashed, skipping.\n", - inst->readPC(), inst->seqNum, inst->threadNumber); - } - } - } -#endif - } template diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 2e6a43f9c..b2b4645d2 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -696,7 +696,7 @@ FullO3CPU::removeThread(unsigned tid) // Squash Throughout Pipeline InstSeqNum squash_seq_num = commit.rob->readHeadInst(tid)->seqNum; - fetch.squash(0, sizeof(TheISA::MachInst), squash_seq_num, true, tid); + fetch.squash(0, sizeof(TheISA::MachInst), squash_seq_num, tid); decode.squash(tid); rename.squash(squash_seq_num, tid); iew.squash(tid); @@ -1226,9 +1226,7 @@ FullO3CPU::removeFrontInst(DynInstPtr &inst) template void -FullO3CPU::removeInstsNotInROB(unsigned tid, - bool squash_delay_slot, - const InstSeqNum &delay_slot_seq_num) +FullO3CPU::removeInstsNotInROB(unsigned tid) { DPRINTF(O3CPU, "Thread %i: Deleting instructions from instruction" " list.\n", tid); @@ -1259,12 +1257,6 @@ FullO3CPU::removeInstsNotInROB(unsigned tid, while (inst_it != end_it) { assert(!instList.empty()); -#if ISA_HAS_DELAY_SLOT - if(!squash_delay_slot && - delay_slot_seq_num >= (*inst_it)->seqNum) { - break; - } -#endif squashInstIt(inst_it, tid); inst_it--; diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index e71d05c8e..4b247e6e3 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -468,8 +468,7 @@ class FullO3CPU : public BaseO3CPU /** Remove all instructions that are not currently in the ROB. * There's also an option to not squash delay slot instructions.*/ - void removeInstsNotInROB(unsigned tid, bool squash_delay_slot, - const InstSeqNum &delay_slot_seq_num); + void removeInstsNotInROB(unsigned tid); /** Remove all instructions younger than the given sequence number. */ void removeInstsUntil(const InstSeqNum &seq_num,unsigned tid); diff --git a/src/cpu/o3/decode_impl.hh b/src/cpu/o3/decode_impl.hh index 314864f94..c9d0a1885 100644 --- a/src/cpu/o3/decode_impl.hh +++ b/src/cpu/o3/decode_impl.hh @@ -49,8 +49,6 @@ DefaultDecode::DefaultDecode(O3CPU *_cpu, Params *params) stalls[i].rename = false; stalls[i].iew = false; stalls[i].commit = false; - - squashAfterDelaySlot[i] = false; } // @todo: Make into a parameter @@ -278,17 +276,12 @@ DefaultDecode::squash(DynInstPtr &inst, unsigned tid) #if ISA_HAS_DELAY_SLOT toFetch->decodeInfo[tid].branchTaken = inst->readNextNPC() != (inst->readNextPC() + sizeof(TheISA::MachInst)); - - toFetch->decodeInfo[tid].bdelayDoneSeqNum = bdelayDoneSeqNum[tid]; - squashAfterDelaySlot[tid] = false; - - InstSeqNum squash_seq_num = bdelayDoneSeqNum[tid]; #else toFetch->decodeInfo[tid].branchTaken = inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst)); +#endif InstSeqNum squash_seq_num = inst->seqNum; -#endif // Might have to tell fetch to unblock. if (decodeStatus[tid] == Blocked || @@ -309,30 +302,10 @@ DefaultDecode::squash(DynInstPtr &inst, unsigned tid) // Clear the instruction list and skid buffer in case they have any // insts in them. while (!insts[tid].empty()) { - -#if ISA_HAS_DELAY_SLOT - if (insts[tid].front()->seqNum <= squash_seq_num) { - DPRINTF(Decode, "[tid:%i]: Cannot remove incoming decode " - "instructions before delay slot [sn:%i]. %i insts" - "left in decode.\n", tid, squash_seq_num, - insts[tid].size()); - break; - } -#endif insts[tid].pop(); } while (!skidBuffer[tid].empty()) { - -#if ISA_HAS_DELAY_SLOT - if (skidBuffer[tid].front()->seqNum <= squash_seq_num) { - DPRINTF(Decode, "[tid:%i]: Cannot remove skidBuffer " - "instructions before delay slot [sn:%i]. %i insts" - "left in decode.\n", tid, squash_seq_num, - insts[tid].size()); - break; - } -#endif skidBuffer[tid].pop(); } @@ -760,48 +733,12 @@ DefaultDecode::decodeInsts(unsigned tid) // Might want to set some sort of boolean and just do // a check at the end -#if !ISA_HAS_DELAY_SLOT squash(inst, inst->threadNumber); Addr target = inst->branchTarget(); inst->setPredTarg(target, target + sizeof(TheISA::MachInst)); break; -#else - // If mispredicted as taken, then ignore delay slot - // instruction... else keep delay slot and squash - // after it is sent to rename - if (inst->readPredTaken() && inst->isCondDelaySlot()) { - DPRINTF(Decode, "[tid:%i]: Conditional delay slot inst." - "[sn:%i] PC %#x mispredicted as taken.\n", tid, - inst->seqNum, inst->PC); - bdelayDoneSeqNum[tid] = inst->seqNum; - squash(inst, inst->threadNumber); - Addr target = inst->branchTarget(); - inst->setPredTarg(target, - target + sizeof(TheISA::MachInst)); - break; - } else { - DPRINTF(Decode, "[tid:%i]: Misprediction detected at " - "[sn:%i] PC %#x, will squash after delay slot " - "inst. is sent to Rename\n", - tid, inst->seqNum, inst->PC); - bdelayDoneSeqNum[tid] = inst->seqNum + 1; - squashAfterDelaySlot[tid] = true; - squashInst[tid] = inst; - continue; - } -#endif } } - - if (squashAfterDelaySlot[tid]) { - assert(!inst->isSquashed()); - squash(squashInst[tid], squashInst[tid]->threadNumber); - Addr target = squashInst[tid]->branchTarget(); - squashInst[tid]->setPredTarg(target, - target + sizeof(TheISA::MachInst)); - assert(!inst->isSquashed()); - break; - } } // If we didn't process all instructions, then we will need to block diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 241935416..bb0057e7c 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -263,8 +263,7 @@ class DefaultFetch * squash should be the commit stage. */ void squash(const Addr &new_PC, const Addr &new_NPC, - const InstSeqNum &seq_num, - bool squash_delay_slot, unsigned tid); + const InstSeqNum &seq_num, unsigned tid); /** Ticks the fetch stage, processing all inputs signals and fetching * as many instructions as possible. diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index e16f97558..25498c7f3 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -774,20 +774,14 @@ DefaultFetch::updateFetchStatus() template void DefaultFetch::squash(const Addr &new_PC, const Addr &new_NPC, - const InstSeqNum &seq_num, - bool squash_delay_slot, unsigned tid) + const InstSeqNum &seq_num, unsigned tid) { DPRINTF(Fetch, "[tid:%u]: Squash from commit.\n",tid); doSquash(new_PC, new_NPC, tid); -#if ISA_HAS_DELAY_SLOT // Tell the CPU to remove any instructions that are not in the ROB. - cpu->removeInstsNotInROB(tid, squash_delay_slot, seq_num); -#else - // Tell the CPU to remove any instructions that are not in the ROB. - cpu->removeInstsNotInROB(tid, true, 0); -#endif + cpu->removeInstsNotInROB(tid); } template @@ -896,17 +890,10 @@ DefaultFetch::checkSignalsAndUpdate(unsigned tid) DPRINTF(Fetch, "[tid:%u]: Squashing instructions due to squash " "from commit.\n",tid); - -#if ISA_HAS_DELAY_SLOT - InstSeqNum doneSeqNum = fromCommit->commitInfo[tid].bdelayDoneSeqNum; -#else - InstSeqNum doneSeqNum = fromCommit->commitInfo[tid].doneSeqNum; -#endif // In any case, squash. squash(fromCommit->commitInfo[tid].nextPC, fromCommit->commitInfo[tid].nextNPC, - doneSeqNum, - fromCommit->commitInfo[tid].squashDelaySlot, + fromCommit->commitInfo[tid].doneSeqNum, tid); // Also check if there's a mispredict that happened. @@ -955,18 +942,13 @@ DefaultFetch::checkSignalsAndUpdate(unsigned tid) if (fetchStatus[tid] != Squashing) { -#if ISA_HAS_DELAY_SLOT - InstSeqNum doneSeqNum = fromDecode->decodeInfo[tid].bdelayDoneSeqNum; -#else - InstSeqNum doneSeqNum = fromDecode->decodeInfo[tid].doneSeqNum; -#endif DPRINTF(Fetch, "Squashing from decode with PC = %#x, NPC = %#x\n", fromDecode->decodeInfo[tid].nextPC, fromDecode->decodeInfo[tid].nextNPC); // Squash unless we're already squashing squashFromDecode(fromDecode->decodeInfo[tid].nextPC, fromDecode->decodeInfo[tid].nextNPC, - doneSeqNum, + fromDecode->decodeInfo[tid].doneSeqNum, tid); return true; @@ -1157,9 +1139,6 @@ DefaultFetch::fetch(bool &status_change) instruction->readPC()); ///FIXME This needs to be more robust in dealing with delay slots -#if !ISA_HAS_DELAY_SLOT -// predicted_branch |= -#endif lookupAndUpdateNextPC(instruction, next_PC, next_NPC); predicted_branch |= (next_PC != fetch_NPC); @@ -1213,11 +1192,7 @@ DefaultFetch::fetch(bool &status_change) PC[tid] = next_PC; nextPC[tid] = next_NPC; nextNPC[tid] = next_NPC + instSize; -#if ISA_HAS_DELAY_SLOT - DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, PC[tid]); -#else DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, next_PC); -#endif } else { // We shouldn't be in an icache miss and also have a fault (an ITB // miss) diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index ce2991cfb..eef5a15d2 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -402,9 +402,6 @@ class DefaultIEW /** Records if there is a fetch redirect on this cycle for each thread. */ bool fetchRedirect[Impl::MaxThreads]; - /** Keeps track of the last valid branch delay slot instss for threads */ - InstSeqNum bdelayDoneSeqNum[Impl::MaxThreads]; - /** Used to track if all instructions have been dispatched this cycle. * If they have not, then blocking must have occurred, and the instructions * would already be added to the skid buffer. diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 62e656e93..050785818 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -69,7 +69,6 @@ DefaultIEW::DefaultIEW(O3CPU *_cpu, Params *params) dispatchStatus[i] = Running; stalls[i].commit = false; fetchRedirect[i] = false; - bdelayDoneSeqNum[i] = 0; } wbMax = wbWidth * params->wbDepth; @@ -410,31 +409,14 @@ DefaultIEW::squash(unsigned tid) instQueue.squash(tid); // Tell the LDSTQ to start squashing. -#if ISA_HAS_DELAY_SLOT - ldstQueue.squash(fromCommit->commitInfo[tid].bdelayDoneSeqNum, tid); -#else ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum, tid); -#endif updatedQueues = true; // Clear the skid buffer in case it has any data in it. DPRINTF(IEW, "[tid:%i]: Removing skidbuffer instructions until [sn:%i].\n", - tid, fromCommit->commitInfo[tid].bdelayDoneSeqNum); + tid, fromCommit->commitInfo[tid].doneSeqNum); while (!skidBuffer[tid].empty()) { -#if ISA_HAS_DELAY_SLOT - if (skidBuffer[tid].front()->seqNum <= - fromCommit->commitInfo[tid].bdelayDoneSeqNum) { - DPRINTF(IEW, "[tid:%i]: Cannot remove skidbuffer instructions " - "that occur before delay slot [sn:%i].\n", - fromCommit->commitInfo[tid].bdelayDoneSeqNum, - tid); - break; - } else { - DPRINTF(IEW, "[tid:%i]: Removing instruction [sn:%i] from " - "skidBuffer.\n", tid, skidBuffer[tid].front()->seqNum); - } -#endif if (skidBuffer[tid].front()->isLoad() || skidBuffer[tid].front()->isStore() ) { toRename->iewInfo[tid].dispatchedToLSQ++; @@ -445,8 +427,6 @@ DefaultIEW::squash(unsigned tid) skidBuffer[tid].pop(); } - bdelayDoneSeqNum[tid] = fromCommit->commitInfo[tid].bdelayDoneSeqNum; - emptyRenameInsts(tid); } @@ -462,38 +442,18 @@ DefaultIEW::squashDueToBranch(DynInstPtr &inst, unsigned tid) toCommit->mispredPC[tid] = inst->readPC(); toCommit->branchMispredict[tid] = true; - int instSize = sizeof(TheISA::MachInst); #if ISA_HAS_DELAY_SLOT - bool branch_taken = + int instSize = sizeof(TheISA::MachInst); + toCommit->branchTaken[tid] = !(inst->readNextPC() + instSize == inst->readNextNPC() && (inst->readNextPC() == inst->readPC() + instSize || inst->readNextPC() == inst->readPC() + 2 * instSize)); - DPRINTF(Sparc, "Branch taken = %s [sn:%i]\n", - branch_taken ? "true": "false", inst->seqNum); - - toCommit->branchTaken[tid] = branch_taken; - - bool squashDelaySlot = true; -// (inst->readNextPC() != inst->readPC() + sizeof(TheISA::MachInst)); - DPRINTF(Sparc, "Squash delay slot = %s [sn:%i]\n", - squashDelaySlot ? "true": "false", inst->seqNum); - toCommit->squashDelaySlot[tid] = squashDelaySlot; - //If we're squashing the delay slot, we need to pick back up at NextPC. - //Otherwise, NextPC isn't being squashed, so we should pick back up at - //NextNPC. - if (squashDelaySlot) { - toCommit->nextPC[tid] = inst->readNextPC(); - toCommit->nextNPC[tid] = inst->readNextNPC(); - } else { - toCommit->nextPC[tid] = inst->readNextNPC(); - toCommit->nextNPC[tid] = inst->readNextNPC() + instSize; - } #else toCommit->branchTaken[tid] = inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst)); - toCommit->nextPC[tid] = inst->readNextPC(); - toCommit->nextNPC[tid] = inst->readNextPC() + instSize; #endif + toCommit->nextPC[tid] = inst->readNextPC(); + toCommit->nextNPC[tid] = inst->readNextNPC(); toCommit->includeSquashInst[tid] = false; @@ -510,11 +470,7 @@ DefaultIEW::squashDueToMemOrder(DynInstPtr &inst, unsigned tid) toCommit->squash[tid] = true; toCommit->squashedSeqNum[tid] = inst->seqNum; toCommit->nextPC[tid] = inst->readNextPC(); -#if ISA_HAS_DELAY_SLOT toCommit->nextNPC[tid] = inst->readNextNPC(); -#else - toCommit->nextNPC[tid] = inst->readNextPC() + sizeof(TheISA::MachInst); -#endif toCommit->branchMispredict[tid] = false; toCommit->includeSquashInst[tid] = false; @@ -532,11 +488,7 @@ DefaultIEW::squashDueToMemBlocked(DynInstPtr &inst, unsigned tid) toCommit->squash[tid] = true; toCommit->squashedSeqNum[tid] = inst->seqNum; toCommit->nextPC[tid] = inst->readPC(); -#if ISA_HAS_DELAY_SLOT toCommit->nextNPC[tid] = inst->readNextPC(); -#else - toCommit->nextNPC[tid] = inst->readPC() + sizeof(TheISA::MachInst); -#endif toCommit->branchMispredict[tid] = false; // Must include the broadcasted SN in the squash. @@ -880,10 +832,8 @@ DefaultIEW::sortInsts() { int insts_from_rename = fromRename->size; #ifdef DEBUG -#if !ISA_HAS_DELAY_SLOT for (int i = 0; i < numThreads; i++) assert(insts[i].empty()); -#endif #endif for (int i = 0; i < insts_from_rename; ++i) { insts[fromRename->insts[i]->threadNumber].push(fromRename->insts[i]); @@ -894,21 +844,9 @@ template void DefaultIEW::emptyRenameInsts(unsigned tid) { - DPRINTF(IEW, "[tid:%i]: Removing incoming rename instructions until " - "[sn:%i].\n", tid, bdelayDoneSeqNum[tid]); + DPRINTF(IEW, "[tid:%i]: Removing incoming rename instructions\n", tid); while (!insts[tid].empty()) { -#if ISA_HAS_DELAY_SLOT - if (insts[tid].front()->seqNum <= bdelayDoneSeqNum[tid]) { - DPRINTF(IEW, "[tid:%i]: Done removing, cannot remove instruction" - " that occurs at or before delay slot [sn:%i].\n", - tid, bdelayDoneSeqNum[tid]); - break; - } else { - DPRINTF(IEW, "[tid:%i]: Removing incoming rename instruction " - "[sn:%i].\n", tid, insts[tid].front()->seqNum); - } -#endif if (insts[tid].front()->isLoad() || insts[tid].front()->isStore() ) { diff --git a/src/cpu/o3/inst_queue_impl.hh b/src/cpu/o3/inst_queue_impl.hh index 10c3287f2..bdf5f07aa 100644 --- a/src/cpu/o3/inst_queue_impl.hh +++ b/src/cpu/o3/inst_queue_impl.hh @@ -1005,11 +1005,7 @@ InstructionQueue::squash(unsigned tid) // Read instruction sequence number of last instruction out of the // time buffer. -#if ISA_HAS_DELAY_SLOT - squashedSeqNum[tid] = fromCommit->commitInfo[tid].bdelayDoneSeqNum; -#else squashedSeqNum[tid] = fromCommit->commitInfo[tid].doneSeqNum; -#endif // Call doSquash if there are insts in the IQ if (count[tid] > 0) { diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index 431705e19..6e7180b1e 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -356,47 +356,12 @@ DefaultRename::squash(const InstSeqNum &squash_seq_num, unsigned tid) } // Clear the instruction list and skid buffer in case they have any - // insts in them. Since we support multiple ISAs, we cant just: - // "insts[tid].clear();" or "skidBuffer[tid].clear()" since there is - // a possible delay slot inst for different architectures - // insts[tid].clear(); -#if ISA_HAS_DELAY_SLOT - DPRINTF(Rename, "[tid:%i] Squashing incoming decode instructions until " - "[sn:%i].\n",tid, squash_seq_num); - ListIt ilist_it = insts[tid].begin(); - while (ilist_it != insts[tid].end()) { - if ((*ilist_it)->seqNum > squash_seq_num) { - (*ilist_it)->setSquashed(); - DPRINTF(Rename, "Squashing incoming decode instruction, " - "[tid:%i] [sn:%i] PC %08p.\n", tid, (*ilist_it)->seqNum, (*ilist_it)->PC); - } - ilist_it++; - } -#else + // insts in them. insts[tid].clear(); -#endif // Clear the skid buffer in case it has any data in it. - // See comments above. - // skidBuffer[tid].clear(); -#if ISA_HAS_DELAY_SLOT - DPRINTF(Rename, "[tid:%i] Squashing incoming skidbuffer instructions " - "until [sn:%i].\n", tid, squash_seq_num); - ListIt slist_it = skidBuffer[tid].begin(); - while (slist_it != skidBuffer[tid].end()) { - if ((*slist_it)->seqNum > squash_seq_num) { - (*slist_it)->setSquashed(); - DPRINTF(Rename, "Squashing skidbuffer instruction, [tid:%i] [sn:%i]" - "PC %08p.\n", tid, (*slist_it)->seqNum, (*slist_it)->PC); - } - slist_it++; - } - resumeUnblocking = (skidBuffer[tid].size() != 0); - DPRINTF(Rename, "Resume unblocking set to %s\n", - resumeUnblocking ? "true" : "false"); -#else skidBuffer[tid].clear(); -#endif + doSquash(squash_seq_num, tid); } @@ -776,10 +741,8 @@ DefaultRename::sortInsts() { int insts_from_decode = fromDecode->size; #ifdef DEBUG -#if !ISA_HAS_DELAY_SLOT for (int i=0; i < numThreads; i++) assert(insts[i].empty()); -#endif #endif for (int i = 0; i < insts_from_decode; ++i) { DynInstPtr inst = fromDecode->insts[i]; @@ -1248,13 +1211,7 @@ DefaultRename::checkSignalsAndUpdate(unsigned tid) DPRINTF(Rename, "[tid:%u]: Squashing instructions due to squash from " "commit.\n", tid); -#if ISA_HAS_DELAY_SLOT - InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].bdelayDoneSeqNum; -#else - InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].doneSeqNum; -#endif - - squash(squashed_seq_num, tid); + squash(fromCommit->commitInfo[tid].doneSeqNum, tid); return true; } From e9c6012acf729ef55b37dda76e011b5a284b6988 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Fri, 13 Apr 2007 14:00:42 +0000 Subject: [PATCH 2/7] Adjust references to reflect differences without special delay slot handling. Performance actually went up slightly. --HG-- extra : convert_revision : 504f6185ddc89881aa41deb7fd934da8038d1ed2 --- .../ref/sparc/linux/o3-timing/m5stats.txt | 263 +++++++++--------- .../ref/sparc/linux/o3-timing/stderr | 1 - .../ref/sparc/linux/o3-timing/stdout | 8 +- 3 files changed, 135 insertions(+), 137 deletions(-) diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt index 7c0d31494..4c5655a33 100644 --- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt +++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt @@ -1,17 +1,17 @@ ---------- Begin Simulation Statistics ---------- global.BPredUnit.BTBCorrect 0 # Number of correct BTB predictions (this stat may not work properly. -global.BPredUnit.BTBHits 2990 # Number of BTB hits -global.BPredUnit.BTBLookups 7055 # Number of BTB lookups +global.BPredUnit.BTBHits 3021 # Number of BTB hits +global.BPredUnit.BTBLookups 7086 # Number of BTB lookups global.BPredUnit.RASInCorrect 0 # Number of incorrect RAS predictions. global.BPredUnit.condIncorrect 2077 # Number of conditional branches incorrect -global.BPredUnit.condPredicted 7846 # Number of conditional branches predicted -global.BPredUnit.lookups 7846 # Number of BP lookups +global.BPredUnit.condPredicted 7877 # Number of conditional branches predicted +global.BPredUnit.lookups 7877 # Number of BP lookups global.BPredUnit.usedRAS 0 # Number of times the RAS was used to get a target. -host_inst_rate 15119 # Simulator instruction rate (inst/s) -host_mem_usage 154868 # Number of bytes of host memory used -host_seconds 0.73 # Real time elapsed on the host -host_tick_rate 1956796 # Simulator tick rate (ticks/s) +host_inst_rate 4388 # Simulator instruction rate (inst/s) +host_mem_usage 179936 # Number of bytes of host memory used +host_seconds 2.50 # Real time elapsed on the host +host_tick_rate 568121 # Simulator tick rate (ticks/s) memdepunit.memDep.conflictingLoads 12 # Number of conflicting loads. memdepunit.memDep.conflictingStores 0 # Number of conflicting stores. memdepunit.memDep.insertedLoads 3250 # Number of loads inserted to the mem dependence unit. @@ -19,22 +19,22 @@ memdepunit.memDep.insertedStores 2817 # Nu sim_freq 1000000000000 # Frequency of simulated ticks sim_insts 10976 # Number of instructions simulated sim_seconds 0.000001 # Number of seconds simulated -sim_ticks 1421211 # Number of ticks simulated +sim_ticks 1421207 # Number of ticks simulated system.cpu.commit.COM:branches 2152 # Number of branches committed -system.cpu.commit.COM:bw_lim_events 172 # number cycles where commit BW limit reached +system.cpu.commit.COM:bw_lim_events 225 # number cycles where commit BW limit reached system.cpu.commit.COM:bw_limited 0 # number of insts not committed due to BW limits system.cpu.commit.COM:committed_per_cycle.start_dist # Number of insts commited each cycle -system.cpu.commit.COM:committed_per_cycle.samples 221349 +system.cpu.commit.COM:committed_per_cycle.samples 220766 system.cpu.commit.COM:committed_per_cycle.min_value 0 - 0 215844 9751.30% - 1 2970 134.18% - 2 1290 58.28% - 3 631 28.51% - 4 208 9.40% - 5 90 4.07% - 6 133 6.01% + 0 215368 9755.49% + 1 2915 132.04% + 2 1196 54.18% + 3 673 30.48% + 4 208 9.42% + 5 79 3.58% + 6 91 4.12% 7 11 0.50% - 8 172 7.77% + 8 225 10.19% system.cpu.commit.COM:committed_per_cycle.max_value 8 system.cpu.commit.COM:committed_per_cycle.end_dist @@ -49,65 +49,65 @@ system.cpu.commit.commitNonSpecStalls 327 # Th system.cpu.commit.commitSquashedInsts 14263 # The number of squashed insts skipped by commit system.cpu.committedInsts 10976 # Number of Instructions Simulated system.cpu.committedInsts_total 10976 # Number of Instructions Simulated -system.cpu.cpi 129.483509 # CPI: Cycles Per Instruction -system.cpu.cpi_total 129.483509 # CPI: Total CPI of All Threads -system.cpu.dcache.ReadReq_accesses 2737 # number of ReadReq accesses(hits+misses) -system.cpu.dcache.ReadReq_avg_miss_latency 6585.044776 # average ReadReq miss latency -system.cpu.dcache.ReadReq_avg_mshr_miss_latency 6511.939394 # average ReadReq mshr miss latency -system.cpu.dcache.ReadReq_hits 2603 # number of ReadReq hits -system.cpu.dcache.ReadReq_miss_latency 882396 # number of ReadReq miss cycles -system.cpu.dcache.ReadReq_miss_rate 0.048959 # miss rate for ReadReq accesses +system.cpu.cpi 129.483145 # CPI: Cycles Per Instruction +system.cpu.cpi_total 129.483145 # CPI: Total CPI of All Threads +system.cpu.dcache.ReadReq_accesses 2738 # number of ReadReq accesses(hits+misses) +system.cpu.dcache.ReadReq_avg_miss_latency 6586.074627 # average ReadReq miss latency +system.cpu.dcache.ReadReq_avg_mshr_miss_latency 6513.166667 # average ReadReq mshr miss latency +system.cpu.dcache.ReadReq_hits 2604 # number of ReadReq hits +system.cpu.dcache.ReadReq_miss_latency 882534 # number of ReadReq miss cycles +system.cpu.dcache.ReadReq_miss_rate 0.048941 # miss rate for ReadReq accesses system.cpu.dcache.ReadReq_misses 134 # number of ReadReq misses system.cpu.dcache.ReadReq_mshr_hits 68 # number of ReadReq MSHR hits -system.cpu.dcache.ReadReq_mshr_miss_latency 429788 # number of ReadReq MSHR miss cycles -system.cpu.dcache.ReadReq_mshr_miss_rate 0.024114 # mshr miss rate for ReadReq accesses +system.cpu.dcache.ReadReq_mshr_miss_latency 429869 # number of ReadReq MSHR miss cycles +system.cpu.dcache.ReadReq_mshr_miss_rate 0.024105 # mshr miss rate for ReadReq accesses system.cpu.dcache.ReadReq_mshr_misses 66 # number of ReadReq MSHR misses system.cpu.dcache.SwapReq_accesses 6 # number of SwapReq accesses(hits+misses) system.cpu.dcache.SwapReq_hits 6 # number of SwapReq hits system.cpu.dcache.WriteReq_accesses 1292 # number of WriteReq accesses(hits+misses) -system.cpu.dcache.WriteReq_avg_miss_latency 7960.583924 # average WriteReq miss latency -system.cpu.dcache.WriteReq_avg_mshr_miss_latency 7136.918605 # average WriteReq mshr miss latency +system.cpu.dcache.WriteReq_avg_miss_latency 7962.583924 # average WriteReq miss latency +system.cpu.dcache.WriteReq_avg_mshr_miss_latency 7138.593023 # average WriteReq mshr miss latency system.cpu.dcache.WriteReq_hits 869 # number of WriteReq hits -system.cpu.dcache.WriteReq_miss_latency 3367327 # number of WriteReq miss cycles +system.cpu.dcache.WriteReq_miss_latency 3368173 # number of WriteReq miss cycles system.cpu.dcache.WriteReq_miss_rate 0.327399 # miss rate for WriteReq accesses system.cpu.dcache.WriteReq_misses 423 # number of WriteReq misses system.cpu.dcache.WriteReq_mshr_hits 337 # number of WriteReq MSHR hits -system.cpu.dcache.WriteReq_mshr_miss_latency 613775 # number of WriteReq MSHR miss cycles +system.cpu.dcache.WriteReq_mshr_miss_latency 613919 # number of WriteReq MSHR miss cycles system.cpu.dcache.WriteReq_mshr_miss_rate 0.066563 # mshr miss rate for WriteReq accesses system.cpu.dcache.WriteReq_mshr_misses 86 # number of WriteReq MSHR misses system.cpu.dcache.avg_blocked_cycles_no_mshrs # average number of cycles each access was blocked system.cpu.dcache.avg_blocked_cycles_no_targets # average number of cycles each access was blocked -system.cpu.dcache.avg_refs 22.881579 # Average number of references to valid blocks. +system.cpu.dcache.avg_refs 22.888158 # Average number of references to valid blocks. system.cpu.dcache.blocked_no_mshrs 0 # number of cycles access was blocked system.cpu.dcache.blocked_no_targets 0 # number of cycles access was blocked system.cpu.dcache.blocked_cycles_no_mshrs 0 # number of cycles access was blocked system.cpu.dcache.blocked_cycles_no_targets 0 # number of cycles access was blocked system.cpu.dcache.cache_copies 0 # number of cache copies performed -system.cpu.dcache.demand_accesses 4029 # number of demand (read+write) accesses -system.cpu.dcache.demand_avg_miss_latency 7629.664273 # average overall miss latency -system.cpu.dcache.demand_avg_mshr_miss_latency 6865.546053 # average overall mshr miss latency -system.cpu.dcache.demand_hits 3472 # number of demand (read+write) hits -system.cpu.dcache.demand_miss_latency 4249723 # number of demand (read+write) miss cycles -system.cpu.dcache.demand_miss_rate 0.138248 # miss rate for demand accesses +system.cpu.dcache.demand_accesses 4030 # number of demand (read+write) accesses +system.cpu.dcache.demand_avg_miss_latency 7631.430880 # average overall miss latency +system.cpu.dcache.demand_avg_mshr_miss_latency 6867.026316 # average overall mshr miss latency +system.cpu.dcache.demand_hits 3473 # number of demand (read+write) hits +system.cpu.dcache.demand_miss_latency 4250707 # number of demand (read+write) miss cycles +system.cpu.dcache.demand_miss_rate 0.138213 # miss rate for demand accesses system.cpu.dcache.demand_misses 557 # number of demand (read+write) misses system.cpu.dcache.demand_mshr_hits 405 # number of demand (read+write) MSHR hits -system.cpu.dcache.demand_mshr_miss_latency 1043563 # number of demand (read+write) MSHR miss cycles -system.cpu.dcache.demand_mshr_miss_rate 0.037726 # mshr miss rate for demand accesses +system.cpu.dcache.demand_mshr_miss_latency 1043788 # number of demand (read+write) MSHR miss cycles +system.cpu.dcache.demand_mshr_miss_rate 0.037717 # mshr miss rate for demand accesses system.cpu.dcache.demand_mshr_misses 152 # number of demand (read+write) MSHR misses system.cpu.dcache.fast_writes 0 # number of fast writes performed system.cpu.dcache.mshr_cap_events 0 # number of times MSHR cap was activated system.cpu.dcache.no_allocate_misses 0 # Number of misses that were no-allocate -system.cpu.dcache.overall_accesses 4029 # number of overall (read+write) accesses -system.cpu.dcache.overall_avg_miss_latency 7629.664273 # average overall miss latency -system.cpu.dcache.overall_avg_mshr_miss_latency 6865.546053 # average overall mshr miss latency +system.cpu.dcache.overall_accesses 4030 # number of overall (read+write) accesses +system.cpu.dcache.overall_avg_miss_latency 7631.430880 # average overall miss latency +system.cpu.dcache.overall_avg_mshr_miss_latency 6867.026316 # average overall mshr miss latency system.cpu.dcache.overall_avg_mshr_uncacheable_latency # average overall mshr uncacheable latency -system.cpu.dcache.overall_hits 3472 # number of overall hits -system.cpu.dcache.overall_miss_latency 4249723 # number of overall miss cycles -system.cpu.dcache.overall_miss_rate 0.138248 # miss rate for overall accesses +system.cpu.dcache.overall_hits 3473 # number of overall hits +system.cpu.dcache.overall_miss_latency 4250707 # number of overall miss cycles +system.cpu.dcache.overall_miss_rate 0.138213 # miss rate for overall accesses system.cpu.dcache.overall_misses 557 # number of overall misses system.cpu.dcache.overall_mshr_hits 405 # number of overall MSHR hits -system.cpu.dcache.overall_mshr_miss_latency 1043563 # number of overall MSHR miss cycles -system.cpu.dcache.overall_mshr_miss_rate 0.037726 # mshr miss rate for overall accesses +system.cpu.dcache.overall_mshr_miss_latency 1043788 # number of overall MSHR miss cycles +system.cpu.dcache.overall_mshr_miss_rate 0.037717 # mshr miss rate for overall accesses system.cpu.dcache.overall_mshr_misses 152 # number of overall MSHR misses system.cpu.dcache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles system.cpu.dcache.overall_mshr_uncacheable_misses 0 # number of overall MSHR uncacheable misses @@ -123,50 +123,50 @@ system.cpu.dcache.prefetcher.num_hwpf_squashed_from_miss 0 system.cpu.dcache.replacements 0 # number of replacements system.cpu.dcache.sampled_refs 152 # Sample count of references to valid blocks. system.cpu.dcache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions -system.cpu.dcache.tagsinuse 90.938737 # Cycle average of tags in use -system.cpu.dcache.total_refs 3478 # Total number of references to valid blocks. +system.cpu.dcache.tagsinuse 90.938565 # Cycle average of tags in use +system.cpu.dcache.total_refs 3479 # Total number of references to valid blocks. system.cpu.dcache.warmup_cycle 0 # Cycle when the warmup percentage was hit. system.cpu.dcache.writebacks 0 # number of writebacks -system.cpu.decode.DECODE:BlockedCycles 192719 # Number of cycles decode is blocked -system.cpu.decode.DECODE:DecodedInsts 39774 # Number of instructions handled by decode -system.cpu.decode.DECODE:IdleCycles 20128 # Number of cycles decode is idle -system.cpu.decode.DECODE:RunCycles 8238 # Number of cycles decode is running +system.cpu.decode.DECODE:BlockedCycles 192302 # Number of cycles decode is blocked +system.cpu.decode.DECODE:DecodedInsts 39763 # Number of instructions handled by decode +system.cpu.decode.DECODE:IdleCycles 19973 # Number of cycles decode is idle +system.cpu.decode.DECODE:RunCycles 8441 # Number of cycles decode is running system.cpu.decode.DECODE:SquashCycles 3162 # Number of cycles decode is squashing -system.cpu.decode.DECODE:UnblockCycles 264 # Number of cycles decode is unblocking -system.cpu.fetch.Branches 7846 # Number of branches that fetch encountered +system.cpu.decode.DECODE:UnblockCycles 50 # Number of cycles decode is unblocking +system.cpu.fetch.Branches 7877 # Number of branches that fetch encountered system.cpu.fetch.CacheLines 5085 # Number of cache lines fetched -system.cpu.fetch.Cycles 14399 # Number of cycles fetch has run and was not squashing or blocked +system.cpu.fetch.Cycles 14430 # Number of cycles fetch has run and was not squashing or blocked system.cpu.fetch.IcacheSquashes 745 # Number of outstanding Icache misses that were squashed -system.cpu.fetch.Insts 43304 # Number of instructions fetch has processed +system.cpu.fetch.Insts 43366 # Number of instructions fetch has processed system.cpu.fetch.SquashCycles 2134 # Number of cycles fetch has spent squashing -system.cpu.fetch.branchRate 0.034947 # Number of branch fetches per cycle +system.cpu.fetch.branchRate 0.035176 # Number of branch fetches per cycle system.cpu.fetch.icacheStallCycles 5085 # Number of cycles fetch is stalled on an Icache miss -system.cpu.fetch.predictedBranches 2990 # Number of branches that fetch has predicted taken -system.cpu.fetch.rate 0.192881 # Number of inst fetches per cycle +system.cpu.fetch.predictedBranches 3021 # Number of branches that fetch has predicted taken +system.cpu.fetch.rate 0.193660 # Number of inst fetches per cycle system.cpu.fetch.rateDist.start_dist # Number of instructions fetched each cycle (Total) -system.cpu.fetch.rateDist.samples 224511 +system.cpu.fetch.rateDist.samples 223928 system.cpu.fetch.rateDist.min_value 0 - 0 215198 9585.19% - 1 2258 100.57% - 2 627 27.93% - 3 958 42.67% - 4 553 24.63% - 5 816 36.35% - 6 951 42.36% - 7 280 12.47% - 8 2870 127.83% + 0 214584 9582.72% + 1 2258 100.84% + 2 658 29.38% + 3 958 42.78% + 4 553 24.70% + 5 816 36.44% + 6 951 42.47% + 7 280 12.50% + 8 2870 128.17% system.cpu.fetch.rateDist.max_value 8 system.cpu.fetch.rateDist.end_dist system.cpu.icache.ReadReq_accesses 5085 # number of ReadReq accesses(hits+misses) -system.cpu.icache.ReadReq_avg_miss_latency 5148.266776 # average ReadReq miss latency -system.cpu.icache.ReadReq_avg_mshr_miss_latency 4502.972752 # average ReadReq mshr miss latency +system.cpu.icache.ReadReq_avg_miss_latency 5150.152209 # average ReadReq miss latency +system.cpu.icache.ReadReq_avg_mshr_miss_latency 4503.673025 # average ReadReq mshr miss latency system.cpu.icache.ReadReq_hits 4474 # number of ReadReq hits -system.cpu.icache.ReadReq_miss_latency 3145591 # number of ReadReq miss cycles +system.cpu.icache.ReadReq_miss_latency 3146743 # number of ReadReq miss cycles system.cpu.icache.ReadReq_miss_rate 0.120157 # miss rate for ReadReq accesses system.cpu.icache.ReadReq_misses 611 # number of ReadReq misses system.cpu.icache.ReadReq_mshr_hits 244 # number of ReadReq MSHR hits -system.cpu.icache.ReadReq_mshr_miss_latency 1652591 # number of ReadReq MSHR miss cycles +system.cpu.icache.ReadReq_mshr_miss_latency 1652848 # number of ReadReq MSHR miss cycles system.cpu.icache.ReadReq_mshr_miss_rate 0.072173 # mshr miss rate for ReadReq accesses system.cpu.icache.ReadReq_mshr_misses 367 # number of ReadReq MSHR misses system.cpu.icache.avg_blocked_cycles_no_mshrs # average number of cycles each access was blocked @@ -178,29 +178,29 @@ system.cpu.icache.blocked_cycles_no_mshrs 0 # n system.cpu.icache.blocked_cycles_no_targets 0 # number of cycles access was blocked system.cpu.icache.cache_copies 0 # number of cache copies performed system.cpu.icache.demand_accesses 5085 # number of demand (read+write) accesses -system.cpu.icache.demand_avg_miss_latency 5148.266776 # average overall miss latency -system.cpu.icache.demand_avg_mshr_miss_latency 4502.972752 # average overall mshr miss latency +system.cpu.icache.demand_avg_miss_latency 5150.152209 # average overall miss latency +system.cpu.icache.demand_avg_mshr_miss_latency 4503.673025 # average overall mshr miss latency system.cpu.icache.demand_hits 4474 # number of demand (read+write) hits -system.cpu.icache.demand_miss_latency 3145591 # number of demand (read+write) miss cycles +system.cpu.icache.demand_miss_latency 3146743 # number of demand (read+write) miss cycles system.cpu.icache.demand_miss_rate 0.120157 # miss rate for demand accesses system.cpu.icache.demand_misses 611 # number of demand (read+write) misses system.cpu.icache.demand_mshr_hits 244 # number of demand (read+write) MSHR hits -system.cpu.icache.demand_mshr_miss_latency 1652591 # number of demand (read+write) MSHR miss cycles +system.cpu.icache.demand_mshr_miss_latency 1652848 # number of demand (read+write) MSHR miss cycles system.cpu.icache.demand_mshr_miss_rate 0.072173 # mshr miss rate for demand accesses system.cpu.icache.demand_mshr_misses 367 # number of demand (read+write) MSHR misses system.cpu.icache.fast_writes 0 # number of fast writes performed system.cpu.icache.mshr_cap_events 0 # number of times MSHR cap was activated system.cpu.icache.no_allocate_misses 0 # Number of misses that were no-allocate system.cpu.icache.overall_accesses 5085 # number of overall (read+write) accesses -system.cpu.icache.overall_avg_miss_latency 5148.266776 # average overall miss latency -system.cpu.icache.overall_avg_mshr_miss_latency 4502.972752 # average overall mshr miss latency +system.cpu.icache.overall_avg_miss_latency 5150.152209 # average overall miss latency +system.cpu.icache.overall_avg_mshr_miss_latency 4503.673025 # average overall mshr miss latency system.cpu.icache.overall_avg_mshr_uncacheable_latency # average overall mshr uncacheable latency system.cpu.icache.overall_hits 4474 # number of overall hits -system.cpu.icache.overall_miss_latency 3145591 # number of overall miss cycles +system.cpu.icache.overall_miss_latency 3146743 # number of overall miss cycles system.cpu.icache.overall_miss_rate 0.120157 # miss rate for overall accesses system.cpu.icache.overall_misses 611 # number of overall misses system.cpu.icache.overall_mshr_hits 244 # number of overall MSHR hits -system.cpu.icache.overall_mshr_miss_latency 1652591 # number of overall MSHR miss cycles +system.cpu.icache.overall_mshr_miss_latency 1652848 # number of overall MSHR miss cycles system.cpu.icache.overall_mshr_miss_rate 0.072173 # mshr miss rate for overall accesses system.cpu.icache.overall_mshr_misses 367 # number of overall MSHR misses system.cpu.icache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles @@ -217,35 +217,35 @@ system.cpu.icache.prefetcher.num_hwpf_squashed_from_miss 0 system.cpu.icache.replacements 1 # number of replacements system.cpu.icache.sampled_refs 363 # Sample count of references to valid blocks. system.cpu.icache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions -system.cpu.icache.tagsinuse 172.869174 # Cycle average of tags in use +system.cpu.icache.tagsinuse 172.868641 # Cycle average of tags in use system.cpu.icache.total_refs 4474 # Total number of references to valid blocks. system.cpu.icache.warmup_cycle 0 # Cycle when the warmup percentage was hit. system.cpu.icache.writebacks 0 # number of writebacks -system.cpu.idleCycles 1196701 # Total number of cycles that the CPU has spent unscheduled due to idling -system.cpu.iew.EXEC:branches 3576 # Number of branches executed +system.cpu.idleCycles 1197280 # Total number of cycles that the CPU has spent unscheduled due to idling +system.cpu.iew.EXEC:branches 3577 # Number of branches executed system.cpu.iew.EXEC:nop 0 # number of nop insts executed -system.cpu.iew.EXEC:rate 0.092548 # Inst execution rate -system.cpu.iew.EXEC:refs 5257 # number of memory reference insts executed +system.cpu.iew.EXEC:rate 0.092802 # Inst execution rate +system.cpu.iew.EXEC:refs 5258 # number of memory reference insts executed system.cpu.iew.EXEC:stores 2386 # Number of stores executed system.cpu.iew.EXEC:swp 0 # number of swp insts executed system.cpu.iew.WB:consumers 9737 # num instructions consuming a value -system.cpu.iew.WB:count 19769 # cumulative count of insts written-back +system.cpu.iew.WB:count 19771 # cumulative count of insts written-back system.cpu.iew.WB:fanout 0.790901 # average fanout of values written-back system.cpu.iew.WB:penalized 0 # number of instrctions required to write to 'other' IQ system.cpu.iew.WB:penalized_rate 0 # fraction of instructions written-back that wrote to 'other' IQ system.cpu.iew.WB:producers 7701 # num instructions producing a value -system.cpu.iew.WB:rate 0.088054 # insts written-back per cycle -system.cpu.iew.WB:sent 20061 # cumulative count of insts sent to commit -system.cpu.iew.branchMispredicts 2593 # Number of branch mispredicts detected at execute +system.cpu.iew.WB:rate 0.088292 # insts written-back per cycle +system.cpu.iew.WB:sent 20063 # cumulative count of insts sent to commit +system.cpu.iew.branchMispredicts 2594 # Number of branch mispredicts detected at execute system.cpu.iew.iewBlockCycles 476 # Number of cycles IEW is blocking system.cpu.iew.iewDispLoadInsts 3250 # Number of dispatched load instructions system.cpu.iew.iewDispNonSpecInsts 617 # Number of dispatched non-speculative instructions -system.cpu.iew.iewDispSquashedInsts 2705 # Number of squashed instructions skipped by dispatch +system.cpu.iew.iewDispSquashedInsts 2694 # Number of squashed instructions skipped by dispatch system.cpu.iew.iewDispStoreInsts 2817 # Number of dispatched store instructions system.cpu.iew.iewDispatchedInsts 25240 # Number of instructions dispatched to IQ -system.cpu.iew.iewExecLoadInsts 2871 # Number of load instructions executed -system.cpu.iew.iewExecSquashedInsts 1780 # Number of squashed instructions skipped in execute -system.cpu.iew.iewExecutedInsts 20778 # Number of executed instructions +system.cpu.iew.iewExecLoadInsts 2872 # Number of load instructions executed +system.cpu.iew.iewExecSquashedInsts 1777 # Number of squashed instructions skipped in execute +system.cpu.iew.iewExecutedInsts 20781 # Number of executed instructions system.cpu.iew.iewIQFullEvents 7 # Number of times the IQ has become full, causing a stall system.cpu.iew.iewIdleCycles 0 # Number of cycles IEW is idle system.cpu.iew.iewLSQFullEvents 0 # Number of times the LSQ has become full, causing a stall @@ -262,7 +262,7 @@ system.cpu.iew.lsq.thread.0.rescheduledLoads 0 system.cpu.iew.lsq.thread.0.squashedLoads 1788 # Number of loads squashed system.cpu.iew.lsq.thread.0.squashedStores 1519 # Number of stores squashed system.cpu.iew.memOrderViolationEvents 54 # Number of memory order violations -system.cpu.iew.predictedNotTakenIncorrect 962 # Number of branches that were predicted not taken incorrectly +system.cpu.iew.predictedNotTakenIncorrect 963 # Number of branches that were predicted not taken incorrectly system.cpu.iew.predictedTakenIncorrect 1631 # Number of branches that were predicted taken incorrectly system.cpu.ipc 0.007723 # IPC: Instructions Per Cycle system.cpu.ipc_total 0.007723 # IPC: Total IPC of All Threads @@ -302,21 +302,21 @@ system.cpu.iq.ISSUE:fu_full.start_dist InstPrefetch 0 0.00% # attempts to use FU when none available system.cpu.iq.ISSUE:fu_full.end_dist system.cpu.iq.ISSUE:issued_per_cycle.start_dist # Number of insts issued each cycle -system.cpu.iq.ISSUE:issued_per_cycle.samples 224511 +system.cpu.iq.ISSUE:issued_per_cycle.samples 223928 system.cpu.iq.ISSUE:issued_per_cycle.min_value 0 - 0 215315 9590.40% - 1 4124 183.69% - 2 1297 57.77% - 3 1306 58.17% - 4 1190 53.00% - 5 707 31.49% - 6 433 19.29% - 7 83 3.70% - 8 56 2.49% + 0 214838 9594.07% + 1 3976 177.56% + 2 1244 55.55% + 3 1359 60.69% + 4 1316 58.77% + 5 612 27.33% + 6 444 19.83% + 7 83 3.71% + 8 56 2.50% system.cpu.iq.ISSUE:issued_per_cycle.max_value 8 system.cpu.iq.ISSUE:issued_per_cycle.end_dist -system.cpu.iq.ISSUE:rate 0.100476 # Inst issue rate +system.cpu.iq.ISSUE:rate 0.100738 # Inst issue rate system.cpu.iq.iqInstsAdded 24623 # Number of instructions added to the IQ (excludes non-spec) system.cpu.iq.iqInstsIssued 22558 # Number of instructions issued system.cpu.iq.iqNonSpecInstsAdded 617 # Number of non-speculative instructions added to the IQ @@ -325,12 +325,12 @@ system.cpu.iq.iqSquashedInstsIssued 174 # Nu system.cpu.iq.iqSquashedNonSpecRemoved 290 # Number of squashed non-spec instructions that were removed system.cpu.iq.iqSquashedOperandsExamined 5834 # Number of squashed operands that are examined and possibly removed from graph system.cpu.l2cache.ReadReq_accesses 513 # number of ReadReq accesses(hits+misses) -system.cpu.l2cache.ReadReq_avg_miss_latency 4754.779727 # average ReadReq miss latency -system.cpu.l2cache.ReadReq_avg_mshr_miss_latency 2343.506823 # average ReadReq mshr miss latency -system.cpu.l2cache.ReadReq_miss_latency 2439202 # number of ReadReq miss cycles +system.cpu.l2cache.ReadReq_avg_miss_latency 4755.715400 # average ReadReq miss latency +system.cpu.l2cache.ReadReq_avg_mshr_miss_latency 2343.752437 # average ReadReq mshr miss latency +system.cpu.l2cache.ReadReq_miss_latency 2439682 # number of ReadReq miss cycles system.cpu.l2cache.ReadReq_miss_rate 1 # miss rate for ReadReq accesses system.cpu.l2cache.ReadReq_misses 513 # number of ReadReq misses -system.cpu.l2cache.ReadReq_mshr_miss_latency 1202219 # number of ReadReq MSHR miss cycles +system.cpu.l2cache.ReadReq_mshr_miss_latency 1202345 # number of ReadReq MSHR miss cycles system.cpu.l2cache.ReadReq_mshr_miss_rate 1 # mshr miss rate for ReadReq accesses system.cpu.l2cache.ReadReq_mshr_misses 513 # number of ReadReq MSHR misses system.cpu.l2cache.avg_blocked_cycles_no_mshrs # average number of cycles each access was blocked @@ -342,29 +342,29 @@ system.cpu.l2cache.blocked_cycles_no_mshrs 0 # system.cpu.l2cache.blocked_cycles_no_targets 0 # number of cycles access was blocked system.cpu.l2cache.cache_copies 0 # number of cache copies performed system.cpu.l2cache.demand_accesses 513 # number of demand (read+write) accesses -system.cpu.l2cache.demand_avg_miss_latency 4754.779727 # average overall miss latency -system.cpu.l2cache.demand_avg_mshr_miss_latency 2343.506823 # average overall mshr miss latency +system.cpu.l2cache.demand_avg_miss_latency 4755.715400 # average overall miss latency +system.cpu.l2cache.demand_avg_mshr_miss_latency 2343.752437 # average overall mshr miss latency system.cpu.l2cache.demand_hits 0 # number of demand (read+write) hits -system.cpu.l2cache.demand_miss_latency 2439202 # number of demand (read+write) miss cycles +system.cpu.l2cache.demand_miss_latency 2439682 # number of demand (read+write) miss cycles system.cpu.l2cache.demand_miss_rate 1 # miss rate for demand accesses system.cpu.l2cache.demand_misses 513 # number of demand (read+write) misses system.cpu.l2cache.demand_mshr_hits 0 # number of demand (read+write) MSHR hits -system.cpu.l2cache.demand_mshr_miss_latency 1202219 # number of demand (read+write) MSHR miss cycles +system.cpu.l2cache.demand_mshr_miss_latency 1202345 # number of demand (read+write) MSHR miss cycles system.cpu.l2cache.demand_mshr_miss_rate 1 # mshr miss rate for demand accesses system.cpu.l2cache.demand_mshr_misses 513 # number of demand (read+write) MSHR misses system.cpu.l2cache.fast_writes 0 # number of fast writes performed system.cpu.l2cache.mshr_cap_events 0 # number of times MSHR cap was activated system.cpu.l2cache.no_allocate_misses 0 # Number of misses that were no-allocate system.cpu.l2cache.overall_accesses 513 # number of overall (read+write) accesses -system.cpu.l2cache.overall_avg_miss_latency 4754.779727 # average overall miss latency -system.cpu.l2cache.overall_avg_mshr_miss_latency 2343.506823 # average overall mshr miss latency +system.cpu.l2cache.overall_avg_miss_latency 4755.715400 # average overall miss latency +system.cpu.l2cache.overall_avg_mshr_miss_latency 2343.752437 # average overall mshr miss latency system.cpu.l2cache.overall_avg_mshr_uncacheable_latency # average overall mshr uncacheable latency system.cpu.l2cache.overall_hits 0 # number of overall hits -system.cpu.l2cache.overall_miss_latency 2439202 # number of overall miss cycles +system.cpu.l2cache.overall_miss_latency 2439682 # number of overall miss cycles system.cpu.l2cache.overall_miss_rate 1 # miss rate for overall accesses system.cpu.l2cache.overall_misses 513 # number of overall misses system.cpu.l2cache.overall_mshr_hits 0 # number of overall MSHR hits -system.cpu.l2cache.overall_mshr_miss_latency 1202219 # number of overall MSHR miss cycles +system.cpu.l2cache.overall_mshr_miss_latency 1202345 # number of overall MSHR miss cycles system.cpu.l2cache.overall_mshr_miss_rate 1 # mshr miss rate for overall accesses system.cpu.l2cache.overall_mshr_misses 513 # number of overall MSHR misses system.cpu.l2cache.overall_mshr_uncacheable_latency 0 # number of overall MSHR uncacheable cycles @@ -381,28 +381,27 @@ system.cpu.l2cache.prefetcher.num_hwpf_squashed_from_miss 0 system.cpu.l2cache.replacements 0 # number of replacements system.cpu.l2cache.sampled_refs 512 # Sample count of references to valid blocks. system.cpu.l2cache.soft_prefetch_mshr_full 0 # number of mshr full events for SW prefetching instrutions -system.cpu.l2cache.tagsinuse 262.946375 # Cycle average of tags in use +system.cpu.l2cache.tagsinuse 262.945674 # Cycle average of tags in use system.cpu.l2cache.total_refs 0 # Total number of references to valid blocks. system.cpu.l2cache.warmup_cycle 0 # Cycle when the warmup percentage was hit. system.cpu.l2cache.writebacks 0 # number of writebacks -system.cpu.numCycles 224511 # number of cpu cycles simulated +system.cpu.numCycles 223928 # number of cpu cycles simulated system.cpu.rename.RENAME:BlockCycles 960 # Number of cycles rename is blocking system.cpu.rename.RENAME:CommittedMaps 9868 # Number of HB maps that are committed system.cpu.rename.RENAME:IQFullEvents 2 # Number of times rename has blocked due to IQ full -system.cpu.rename.RENAME:IdleCycles 20098 # Number of cycles rename is idle -system.cpu.rename.RENAME:LSQFullEvents 481 # Number of times rename has blocked due to LSQ full +system.cpu.rename.RENAME:IdleCycles 21302 # Number of cycles rename is idle +system.cpu.rename.RENAME:LSQFullEvents 411 # Number of times rename has blocked due to LSQ full system.cpu.rename.RENAME:ROBFullEvents 4 # Number of times rename has blocked due to ROB full system.cpu.rename.RENAME:RenameLookups 46931 # Number of register rename lookups that rename has made -system.cpu.rename.RENAME:RenamedInsts 31260 # Number of instructions processed by rename +system.cpu.rename.RENAME:RenamedInsts 31249 # Number of instructions processed by rename system.cpu.rename.RENAME:RenamedOperands 25831 # Number of destination operands rename has renamed -system.cpu.rename.RENAME:RunCycles 7921 # Number of cycles rename is running +system.cpu.rename.RENAME:RunCycles 7136 # Number of cycles rename is running system.cpu.rename.RENAME:SquashCycles 3162 # Number of cycles rename is squashing -system.cpu.rename.RENAME:SquashedInsts 8042 # Number of squashed instructions processed by rename -system.cpu.rename.RENAME:UnblockCycles 1212 # Number of cycles rename is unblocking +system.cpu.rename.RENAME:UnblockCycles 614 # Number of cycles rename is unblocking system.cpu.rename.RENAME:UndoneMaps 15963 # Number of HB maps that are undone due to squashing -system.cpu.rename.RENAME:serializeStallCycles 190573 # count of cycles rename stalled for serializing inst +system.cpu.rename.RENAME:serializeStallCycles 190754 # count of cycles rename stalled for serializing inst system.cpu.rename.RENAME:serializingInsts 638 # count of serializing insts renamed -system.cpu.rename.RENAME:skidInsts 5594 # count of insts added to the skid buffer +system.cpu.rename.RENAME:skidInsts 5529 # count of insts added to the skid buffer system.cpu.rename.RENAME:tempSerializingInsts 629 # count of temporary serializing insts renamed system.cpu.timesIdled 289 # Number of times that the entire CPU went into an idle state and unscheduled itself system.cpu.workload.PROG:num_syscalls 8 # Number of system calls diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr index 48affb0e2..7873672f2 100644 --- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr +++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr @@ -1,4 +1,3 @@ warn: More than two loadable segments in ELF object. warn: Ignoring segment @ 0x0 length 0x0. -0: system.remote_gdb.listener: listening for remote gdb on port 7003 warn: Entering event queue @ 0. Starting simulation... diff --git a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout index 6cba2ba7e..38b0c1787 100644 --- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout +++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout @@ -16,9 +16,9 @@ The Regents of The University of Michigan All Rights Reserved -M5 compiled Apr 9 2007 03:06:26 -M5 started Mon Apr 9 03:06:54 2007 -M5 executing on zizzer.eecs.umich.edu +M5 compiled Apr 13 2007 13:56:34 +M5 started Fri Apr 13 13:56:35 2007 +M5 executing on ahchoo.blinky.homelinux.org command line: build/SPARC_SE/m5.fast -d build/SPARC_SE/tests/fast/quick/02.insttest/sparc/linux/o3-timing tests/run.py quick/02.insttest/sparc/linux/o3-timing Global frequency set at 1000000000000 ticks per second -Exiting @ tick 1421211 because target called exit() +Exiting @ tick 1421207 because target called exit() From 3140dd88bc588ea51aadeb2dd58d33cc9a40883a Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Sat, 14 Apr 2007 17:07:24 +0000 Subject: [PATCH 3/7] Make the fsr a serializing register. Other control registers probably need this as well. --HG-- extra : convert_revision : edd3f9a83cc2722b6e0eff0eff4a8e034b0f6ec6 --- src/arch/sparc/isa/operands.isa | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arch/sparc/isa/operands.isa b/src/arch/sparc/isa/operands.isa index 58d616a7a..110b37d15 100644 --- a/src/arch/sparc/isa/operands.isa +++ b/src/arch/sparc/isa/operands.isa @@ -187,7 +187,7 @@ def operands {{ 'Hver': ('ControlReg', 'udw', 'MISCREG_HVER', None, 74), 'StrandStsReg': ('ControlReg', 'udw', 'MISCREG_STRAND_STS_REG', None, 75), - 'Fsr': ('ControlReg', 'udw', 'MISCREG_FSR', None, 80), + 'Fsr': ('ControlReg', 'udw', 'MISCREG_FSR', (None, None, ['IsSerializeAfter','IsSerializing','IsNonSpeculative']), 80), # Mem gets a large number so it's always last 'Mem': ('Mem', 'udw', None, ('IsMemRef', 'IsLoad', 'IsStore'), 100) From 5a3dcc172a9fd661330909815b163eb6f4d6a2d8 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Sat, 14 Apr 2007 17:08:24 +0000 Subject: [PATCH 4/7] Make register indexes larger so they can actually hold all the legal values. Oops! --HG-- extra : convert_revision : 7689b2e1f7468e4acb8be0f242f74002c79e7960 --- src/arch/sparc/types.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arch/sparc/types.hh b/src/arch/sparc/types.hh index 15386adca..8bd50b7e8 100644 --- a/src/arch/sparc/types.hh +++ b/src/arch/sparc/types.hh @@ -59,7 +59,7 @@ namespace SparcISA typedef int RegContextVal; - typedef uint8_t RegIndex; + typedef uint16_t RegIndex; } #endif From c3081d9c1c36e1a08c173048783d191fa19463de Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Sat, 14 Apr 2007 17:13:18 +0000 Subject: [PATCH 5/7] Add support for microcode and pull out the special branch delay slot handling. Branch delay slots need to be squash on a mispredict as well because the nnpc they saw was incorrect. --HG-- extra : convert_revision : 8b9c603616bcad254417a7a3fa3edfb4c8728719 --- src/cpu/base_dyn_inst.hh | 57 +++++++++++++++++-- src/cpu/base_dyn_inst_impl.hh | 55 +++++++++++++++++-- src/cpu/o3/comm.hh | 21 ++++--- src/cpu/o3/commit.hh | 34 +++++++++--- src/cpu/o3/commit_impl.hh | 11 ++-- src/cpu/o3/cpu.cc | 30 +++++++++- src/cpu/o3/cpu.hh | 22 ++++++-- src/cpu/o3/decode_impl.hh | 4 +- src/cpu/o3/fetch.hh | 17 +++--- src/cpu/o3/fetch_impl.hh | 91 ++++++++++++++++++++----------- src/cpu/o3/iew_impl.hh | 1 + src/cpu/o3/rename_impl.hh | 6 +- src/cpu/o3/sparc/dyn_inst.hh | 10 +++- src/cpu/o3/sparc/dyn_inst_impl.hh | 19 ++++++- 14 files changed, 292 insertions(+), 86 deletions(-) diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index b02038b3e..1311e5cf2 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -209,6 +209,9 @@ class BaseDynInst : public FastAlloc, public RefCounted /** PC of this instruction. */ Addr PC; + /** Micro PC of this instruction. */ + Addr microPC; + protected: /** Next non-speculative PC. It is not filled in at fetch, but rather * once the target of the branch is truly known (either decode or @@ -219,12 +222,18 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Next non-speculative NPC. Target PC for Mips or Sparc. */ Addr nextNPC; + /** Next non-speculative micro PC. */ + Addr nextMicroPC; + /** Predicted next PC. */ Addr predPC; /** Predicted next NPC. */ Addr predNPC; + /** Predicted next microPC */ + Addr predMicroPC; + /** If this is a branch that was predicted taken */ bool predTaken; @@ -340,6 +349,17 @@ class BaseDynInst : public FastAlloc, public RefCounted { _flatDestRegIdx[idx] = flattened_dest; } + /** BaseDynInst constructor given a binary instruction. + * @param staticInst A StaticInstPtr to the underlying instruction. + * @param PC The PC of the instruction. + * @param pred_PC The predicted next PC. + * @param pred_NPC The predicted next NPC. + * @param seq_num The sequence number of the instruction. + * @param cpu Pointer to the instruction's CPU. + */ + BaseDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC, + Addr pred_PC, Addr pred_NPC, Addr pred_MicroPC, + InstSeqNum seq_num, ImplCPU *cpu); /** BaseDynInst constructor given a binary instruction. * @param inst The binary instruction. @@ -349,8 +369,8 @@ class BaseDynInst : public FastAlloc, public RefCounted * @param seq_num The sequence number of the instruction. * @param cpu Pointer to the instruction's CPU. */ - BaseDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, - Addr pred_PC, Addr pred_NPC, + BaseDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, Addr microPC, + Addr pred_PC, Addr pred_NPC, Addr pred_MicroPC, InstSeqNum seq_num, ImplCPU *cpu); /** BaseDynInst constructor given a StaticInst pointer. @@ -402,11 +422,18 @@ class BaseDynInst : public FastAlloc, public RefCounted #endif } + Addr readNextMicroPC() + { + return nextMicroPC; + } + /** Set the predicted target of this current instruction. */ - void setPredTarg(Addr predicted_PC, Addr predicted_NPC) + void setPredTarg(Addr predicted_PC, Addr predicted_NPC, + Addr predicted_MicroPC) { predPC = predicted_PC; predNPC = predicted_NPC; + predMicroPC = predicted_MicroPC; } /** Returns the predicted PC immediately after the branch. */ @@ -415,6 +442,9 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Returns the predicted PC two instructions after the branch */ Addr readPredNPC() { return predNPC; } + /** Returns the predicted micro PC after the branch */ + Addr readPredMicroPC() { return predMicroPC; } + /** Returns whether the instruction was predicted taken or not. */ bool readPredTaken() { @@ -430,7 +460,8 @@ class BaseDynInst : public FastAlloc, public RefCounted bool mispredicted() { return readPredPC() != readNextPC() || - readPredNPC() != readNextNPC(); + readPredNPC() != readNextNPC() || + readPredMicroPC() != readNextMicroPC(); } // @@ -467,6 +498,12 @@ class BaseDynInst : public FastAlloc, public RefCounted bool isQuiesce() const { return staticInst->isQuiesce(); } bool isIprAccess() const { return staticInst->isIprAccess(); } bool isUnverifiable() const { return staticInst->isUnverifiable(); } + bool isMacroOp() const { return staticInst->isMacroOp(); } + bool isMicroOp() const { return staticInst->isMicroOp(); } + bool isDelayedCommit() const { return staticInst->isDelayedCommit(); } + bool isLastMicroOp() const { return staticInst->isLastMicroOp(); } + bool isFirstMicroOp() const { return staticInst->isFirstMicroOp(); } + bool isMicroBranch() const { return staticInst->isMicroBranch(); } /** Temporarily sets this instruction as a serialize before instruction. */ void setSerializeBefore() { status.set(SerializeBefore); } @@ -700,20 +737,28 @@ class BaseDynInst : public FastAlloc, public RefCounted /** Read the PC of this instruction. */ const Addr readPC() const { return PC; } + /**Read the micro PC of this instruction. */ + const Addr readMicroPC() const { return microPC; } + /** Set the next PC of this instruction (its actual target). */ - void setNextPC(uint64_t val) + void setNextPC(Addr val) { nextPC = val; } /** Set the next NPC of this instruction (the target in Mips or Sparc).*/ - void setNextNPC(uint64_t val) + void setNextNPC(Addr val) { #if ISA_HAS_DELAY_SLOT nextNPC = val; #endif } + void setNextMicroPC(Addr val) + { + nextMicroPC = val; + } + /** Sets the ASID. */ void setASID(short addr_space_id) { asid = addr_space_id; } diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh index a1c866336..acf8af9cf 100644 --- a/src/cpu/base_dyn_inst_impl.hh +++ b/src/cpu/base_dyn_inst_impl.hh @@ -62,19 +62,66 @@ my_hash_t thishash; #endif template -BaseDynInst::BaseDynInst(TheISA::ExtMachInst machInst, +BaseDynInst::BaseDynInst(StaticInstPtr _staticInst, Addr inst_PC, Addr inst_NPC, + Addr inst_MicroPC, Addr pred_PC, Addr pred_NPC, + Addr pred_MicroPC, InstSeqNum seq_num, ImplCPU *cpu) - : staticInst(machInst), traceData(NULL), cpu(cpu) + : staticInst(_staticInst), traceData(NULL), cpu(cpu) { seqNum = seq_num; + bool nextIsMicro = + staticInst->isMicroOp() && !staticInst->isLastMicroOp(); + PC = inst_PC; - nextPC = inst_NPC; - nextNPC = nextPC + sizeof(TheISA::MachInst); + microPC = inst_MicroPC; + if (nextIsMicro) { + nextPC = inst_PC; + nextNPC = inst_NPC; + nextMicroPC = microPC + 1; + } else { + nextPC = inst_NPC; + nextNPC = nextPC + sizeof(TheISA::MachInst); + nextMicroPC = 0; + } predPC = pred_PC; predNPC = pred_NPC; + predMicroPC = pred_MicroPC; + predTaken = false; + + initVars(); +} + +template +BaseDynInst::BaseDynInst(TheISA::ExtMachInst inst, + Addr inst_PC, Addr inst_NPC, + Addr inst_MicroPC, + Addr pred_PC, Addr pred_NPC, + Addr pred_MicroPC, + InstSeqNum seq_num, ImplCPU *cpu) + : staticInst(inst), traceData(NULL), cpu(cpu) +{ + seqNum = seq_num; + + bool nextIsMicro = + staticInst->isMicroOp() && !staticInst->isLastMicroOp(); + + PC = inst_PC; + microPC = inst_MicroPC; + if (nextIsMicro) { + nextPC = inst_PC; + nextNPC = inst_NPC; + nextMicroPC = microPC + 1; + } else { + nextPC = inst_NPC; + nextNPC = nextPC + sizeof(TheISA::MachInst); + nextMicroPC = 0; + } + predPC = pred_PC; + predNPC = pred_NPC; + predMicroPC = pred_MicroPC; predTaken = false; initVars(); diff --git a/src/cpu/o3/comm.hh b/src/cpu/o3/comm.hh index 8d7bb95f4..fb772060b 100644 --- a/src/cpu/o3/comm.hh +++ b/src/cpu/o3/comm.hh @@ -87,9 +87,10 @@ struct DefaultIEWDefaultCommit { bool squash[Impl::MaxThreads]; bool branchMispredict[Impl::MaxThreads]; bool branchTaken[Impl::MaxThreads]; - uint64_t mispredPC[Impl::MaxThreads]; - uint64_t nextPC[Impl::MaxThreads]; - uint64_t nextNPC[Impl::MaxThreads]; + Addr mispredPC[Impl::MaxThreads]; + Addr nextPC[Impl::MaxThreads]; + Addr nextNPC[Impl::MaxThreads]; + Addr nextMicroPC[Impl::MaxThreads]; InstSeqNum squashedSeqNum[Impl::MaxThreads]; bool includeSquashInst[Impl::MaxThreads]; @@ -118,9 +119,10 @@ struct TimeBufStruct { // struct as it is used pretty frequently. bool branchMispredict; bool branchTaken; - uint64_t mispredPC; - uint64_t nextPC; - uint64_t nextNPC; + Addr mispredPC; + Addr nextPC; + Addr nextNPC; + Addr nextMicroPC; unsigned branchCount; }; @@ -158,9 +160,10 @@ struct TimeBufStruct { bool branchMispredict; bool branchTaken; - uint64_t mispredPC; - uint64_t nextPC; - uint64_t nextNPC; + Addr mispredPC; + Addr nextPC; + Addr nextNPC; + Addr nextMicroPC; // Represents the instruction that has either been retired or // squashed. Similar to having a single bus that broadcasts the diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index fba618c14..27bdd20c5 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -279,25 +279,37 @@ class DefaultCommit /** Returns the PC of the head instruction of the ROB. * @todo: Probably remove this function as it returns only thread 0. */ - uint64_t readPC() { return PC[0]; } + Addr readPC() { return PC[0]; } /** Returns the PC of a specific thread. */ - uint64_t readPC(unsigned tid) { return PC[tid]; } + Addr readPC(unsigned tid) { return PC[tid]; } /** Sets the PC of a specific thread. */ - void setPC(uint64_t val, unsigned tid) { PC[tid] = val; } + void setPC(Addr val, unsigned tid) { PC[tid] = val; } + + /** Reads the micro PC of a specific thread. */ + Addr readMicroPC(unsigned tid) { return microPC[tid]; } + + /** Sets the micro PC of a specific thread */ + void setMicroPC(Addr val, unsigned tid) { microPC[tid] = val; } /** Reads the next PC of a specific thread. */ - uint64_t readNextPC(unsigned tid) { return nextPC[tid]; } + Addr readNextPC(unsigned tid) { return nextPC[tid]; } /** Sets the next PC of a specific thread. */ - void setNextPC(uint64_t val, unsigned tid) { nextPC[tid] = val; } + void setNextPC(Addr val, unsigned tid) { nextPC[tid] = val; } /** Reads the next NPC of a specific thread. */ - uint64_t readNextNPC(unsigned tid) { return nextNPC[tid]; } + Addr readNextNPC(unsigned tid) { return nextNPC[tid]; } /** Sets the next NPC of a specific thread. */ - void setNextNPC(uint64_t val, unsigned tid) { nextNPC[tid] = val; } + void setNextNPC(Addr val, unsigned tid) { nextNPC[tid] = val; } + + /** Reads the micro PC of a specific thread. */ + Addr readNextMicroPC(unsigned tid) { return nextMicroPC[tid]; } + + /** Sets the micro PC of a specific thread */ + void setNextMicroPC(Addr val, unsigned tid) { nextMicroPC[tid] = val; } private: /** Time buffer interface. */ @@ -402,12 +414,20 @@ class DefaultCommit */ Addr PC[Impl::MaxThreads]; + /** The commit micro PC of each thread. Refers to the instruction that + * is currently being processed/committed. + */ + Addr microPC[Impl::MaxThreads]; + /** The next PC of each thread. */ Addr nextPC[Impl::MaxThreads]; /** The next NPC of each thread. */ Addr nextNPC[Impl::MaxThreads]; + /** The next micro PC of each thread. */ + Addr nextMicroPC[Impl::MaxThreads]; + /** The sequence number of the youngest valid instruction in the ROB. */ InstSeqNum youngestSeqNum[Impl::MaxThreads]; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 9dd5ed291..fc24d7edc 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -124,7 +124,7 @@ DefaultCommit::DefaultCommit(O3CPU *_cpu, Params *params) committedStores[i] = false; trapSquash[i] = false; tcSquash[i] = false; - PC[i] = nextPC[i] = nextNPC[i] = 0; + microPC[i] = nextMicroPC[i] = PC[i] = nextPC[i] = nextNPC[i] = 0; } #if FULL_SYSTEM interrupt = NoFault; @@ -508,6 +508,7 @@ DefaultCommit::squashAll(unsigned tid) toIEW->commitInfo[tid].nextPC = PC[tid]; toIEW->commitInfo[tid].nextNPC = nextPC[tid]; + toIEW->commitInfo[tid].nextMicroPC = nextMicroPC[tid]; } template @@ -768,6 +769,7 @@ DefaultCommit::commit() toIEW->commitInfo[tid].nextPC = fromIEW->nextPC[tid]; toIEW->commitInfo[tid].nextNPC = fromIEW->nextNPC[tid]; + toIEW->commitInfo[tid].nextMicroPC = fromIEW->nextMicroPC[tid]; toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid]; @@ -877,6 +879,7 @@ DefaultCommit::commitInsts() PC[tid] = head_inst->readPC(); nextPC[tid] = head_inst->readNextPC(); nextNPC[tid] = head_inst->readNextNPC(); + nextMicroPC[tid] = head_inst->readNextMicroPC(); // Increment the total number of non-speculative instructions // executed. @@ -905,12 +908,10 @@ DefaultCommit::commitInsts() } PC[tid] = nextPC[tid]; -#if ISA_HAS_DELAY_SLOT nextPC[tid] = nextNPC[tid]; nextNPC[tid] = nextNPC[tid] + sizeof(TheISA::MachInst); -#else - nextPC[tid] = nextPC[tid] + sizeof(TheISA::MachInst); -#endif + microPC[tid] = nextMicroPC[tid]; + nextMicroPC[tid] = microPC[tid] + 1; #if FULL_SYSTEM int count = 0; diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index b2b4645d2..59978a065 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -696,7 +696,7 @@ FullO3CPU::removeThread(unsigned tid) // Squash Throughout Pipeline InstSeqNum squash_seq_num = commit.rob->readHeadInst(tid)->seqNum; - fetch.squash(0, sizeof(TheISA::MachInst), squash_seq_num, tid); + fetch.squash(0, sizeof(TheISA::MachInst), 0, squash_seq_num, tid); decode.squash(tid); rename.squash(squash_seq_num, tid); iew.squash(tid); @@ -1150,6 +1150,20 @@ FullO3CPU::setPC(Addr new_PC,unsigned tid) commit.setPC(new_PC, tid); } +template +uint64_t +FullO3CPU::readMicroPC(unsigned tid) +{ + return commit.readMicroPC(tid); +} + +template +void +FullO3CPU::setMicroPC(Addr new_PC,unsigned tid) +{ + commit.setMicroPC(new_PC, tid); +} + template uint64_t FullO3CPU::readNextPC(unsigned tid) @@ -1178,6 +1192,20 @@ FullO3CPU::setNextNPC(uint64_t val,unsigned tid) commit.setNextNPC(val, tid); } +template +uint64_t +FullO3CPU::readNextMicroPC(unsigned tid) +{ + return commit.readNextMicroPC(tid); +} + +template +void +FullO3CPU::setNextMicroPC(Addr new_PC,unsigned tid) +{ + commit.setNextMicroPC(new_PC, tid); +} + template typename FullO3CPU::ListIt FullO3CPU::addInst(DynInstPtr &inst) diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 4b247e6e3..bff78bf9e 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -433,22 +433,34 @@ class FullO3CPU : public BaseO3CPU void setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid); /** Reads the commit PC of a specific thread. */ - uint64_t readPC(unsigned tid); + Addr readPC(unsigned tid); /** Sets the commit PC of a specific thread. */ void setPC(Addr new_PC, unsigned tid); + /** Reads the commit micro PC of a specific thread. */ + Addr readMicroPC(unsigned tid); + + /** Sets the commmit micro PC of a specific thread. */ + void setMicroPC(Addr new_microPC, unsigned tid); + /** Reads the next PC of a specific thread. */ - uint64_t readNextPC(unsigned tid); + Addr readNextPC(unsigned tid); /** Sets the next PC of a specific thread. */ - void setNextPC(uint64_t val, unsigned tid); + void setNextPC(Addr val, unsigned tid); /** Reads the next NPC of a specific thread. */ - uint64_t readNextNPC(unsigned tid); + Addr readNextNPC(unsigned tid); /** Sets the next NPC of a specific thread. */ - void setNextNPC(uint64_t val, unsigned tid); + void setNextNPC(Addr val, unsigned tid); + + /** Reads the commit next micro PC of a specific thread. */ + Addr readNextMicroPC(unsigned tid); + + /** Sets the commit next micro PC of a specific thread. */ + void setNextMicroPC(Addr val, unsigned tid); /** Function to add instruction onto the head of the list of the * instructions. Used when new instructions are fetched. diff --git a/src/cpu/o3/decode_impl.hh b/src/cpu/o3/decode_impl.hh index c9d0a1885..ce6738456 100644 --- a/src/cpu/o3/decode_impl.hh +++ b/src/cpu/o3/decode_impl.hh @@ -273,6 +273,7 @@ DefaultDecode::squash(DynInstPtr &inst, unsigned tid) ///explicitly for ISAs with delay slots. toFetch->decodeInfo[tid].nextNPC = inst->branchTarget() + sizeof(TheISA::MachInst); + toFetch->decodeInfo[tid].nextMicroPC = inst->readMicroPC(); #if ISA_HAS_DELAY_SLOT toFetch->decodeInfo[tid].branchTaken = inst->readNextNPC() != (inst->readNextPC() + sizeof(TheISA::MachInst)); @@ -735,7 +736,8 @@ DefaultDecode::decodeInsts(unsigned tid) // a check at the end squash(inst, inst->threadNumber); Addr target = inst->branchTarget(); - inst->setPredTarg(target, target + sizeof(TheISA::MachInst)); + //The micro pc after an instruction level branch should be 0 + inst->setPredTarg(target, target + sizeof(TheISA::MachInst), 0); break; } } diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index bb0057e7c..7645a226c 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -227,7 +227,7 @@ class DefaultFetch * @param next_NPC Used for ISAs which use delay slots. * @return Whether or not a branch was predicted as taken. */ - bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, Addr &next_NPC); + bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, Addr &next_NPC, Addr &next_MicroPC); /** * Fetches the cache line that contains fetch_PC. Returns any @@ -242,12 +242,14 @@ class DefaultFetch bool fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid); /** Squashes a specific thread and resets the PC. */ - inline void doSquash(const Addr &new_PC, const Addr &new_NPC, unsigned tid); + inline void doSquash(const Addr &new_PC, const Addr &new_NPC, + const Addr &new_MicroPC, unsigned tid); /** Squashes a specific thread and resets the PC. Also tells the CPU to * remove any instructions between fetch and decode that should be sqaushed. */ void squashFromDecode(const Addr &new_PC, const Addr &new_NPC, + const Addr &new_MicroPC, const InstSeqNum &seq_num, unsigned tid); /** Checks if a thread is stalled. */ @@ -263,6 +265,7 @@ class DefaultFetch * squash should be the commit stage. */ void squash(const Addr &new_PC, const Addr &new_NPC, + const Addr &new_MicroPC, const InstSeqNum &seq_num, unsigned tid); /** Ticks the fetch stage, processing all inputs signals and fetching @@ -346,16 +349,12 @@ class DefaultFetch /** Per-thread fetch PC. */ Addr PC[Impl::MaxThreads]; + /** Per-thread fetch micro PC. */ + Addr microPC[Impl::MaxThreads]; + /** Per-thread next PC. */ Addr nextPC[Impl::MaxThreads]; - /** Per-thread next Next PC. - * This is not a real register but is used for - * architectures that use a branch-delay slot. - * (such as MIPS or Sparc) - */ - Addr nextNPC[Impl::MaxThreads]; - /** Memory request used to access cache. */ RequestPtr memReq[Impl::MaxThreads]; diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index 25498c7f3..d1f38e38b 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -312,7 +312,7 @@ DefaultFetch::initStage() for (int tid = 0; tid < numThreads; tid++) { PC[tid] = cpu->readPC(tid); nextPC[tid] = cpu->readNextPC(tid); - nextNPC[tid] = cpu->readNextNPC(tid); + microPC[tid] = cpu->readMicroPC(tid); } for (int tid=0; tid < numThreads; tid++) { @@ -439,11 +439,7 @@ DefaultFetch::takeOverFrom() stalls[i].commit = 0; PC[i] = cpu->readPC(i); nextPC[i] = cpu->readNextPC(i); -#if ISA_HAS_DELAY_SLOT - nextNPC[i] = cpu->readNextNPC(i); -#else - nextNPC[i] = nextPC[i] + sizeof(TheISA::MachInst); -#endif + microPC[i] = cpu->readMicroPC(i); fetchStatus[i] = Running; } numInst = 0; @@ -493,7 +489,7 @@ DefaultFetch::switchToInactive() template bool DefaultFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, - Addr &next_NPC) + Addr &next_NPC, Addr &next_MicroPC) { // Do branch prediction check here. // A bit of a misnomer...next_PC is actually the current PC until @@ -501,13 +497,22 @@ DefaultFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, bool predict_taken; if (!inst->isControl()) { - next_PC = next_NPC; - next_NPC = next_NPC + instSize; - inst->setPredTarg(next_PC, next_NPC); + if (inst->isMicroOp() && !inst->isLastMicroOp()) { + next_MicroPC++; + } else { + next_PC = next_NPC; + next_NPC = next_NPC + instSize; + next_MicroPC = 0; + } + inst->setPredTarg(next_PC, next_NPC, next_MicroPC); inst->setPredTaken(false); return false; } + //Assume for now that all control flow is to a different macroop which + //would reset the micro pc to 0. + next_MicroPC = 0; + int tid = inst->threadNumber; Addr pred_PC = next_PC; predict_taken = branchPred.predict(inst, pred_PC, tid); @@ -534,7 +539,7 @@ DefaultFetch::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, #endif /* DPRINTF(Fetch, "[tid:%i]: Branch predicted to go to %#x and then %#x.\n", tid, next_PC, next_NPC);*/ - inst->setPredTarg(next_PC, next_NPC); + inst->setPredTarg(next_PC, next_NPC, next_MicroPC); inst->setPredTaken(predict_taken); ++fetchedBranches; @@ -658,14 +663,14 @@ DefaultFetch::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid template inline void DefaultFetch::doSquash(const Addr &new_PC, - const Addr &new_NPC, unsigned tid) + const Addr &new_NPC, const Addr &new_microPC, unsigned tid) { DPRINTF(Fetch, "[tid:%i]: Squashing, setting PC to: %#x, NPC to: %#x.\n", tid, new_PC, new_NPC); PC[tid] = new_PC; nextPC[tid] = new_NPC; - nextNPC[tid] = new_NPC + instSize; + microPC[tid] = new_microPC; // Clear the icache miss if it's outstanding. if (fetchStatus[tid] == IcacheWaitResponse) { @@ -693,12 +698,12 @@ DefaultFetch::doSquash(const Addr &new_PC, template void DefaultFetch::squashFromDecode(const Addr &new_PC, const Addr &new_NPC, - const InstSeqNum &seq_num, - unsigned tid) + const Addr &new_MicroPC, + const InstSeqNum &seq_num, unsigned tid) { DPRINTF(Fetch, "[tid:%i]: Squashing from decode.\n",tid); - doSquash(new_PC, new_NPC, tid); + doSquash(new_PC, new_NPC, new_MicroPC, tid); // Tell the CPU to remove any instructions that are in flight between // fetch and decode. @@ -774,11 +779,12 @@ DefaultFetch::updateFetchStatus() template void DefaultFetch::squash(const Addr &new_PC, const Addr &new_NPC, + const Addr &new_MicroPC, const InstSeqNum &seq_num, unsigned tid) { DPRINTF(Fetch, "[tid:%u]: Squash from commit.\n",tid); - doSquash(new_PC, new_NPC, tid); + doSquash(new_PC, new_NPC, new_MicroPC, tid); // Tell the CPU to remove any instructions that are not in the ROB. cpu->removeInstsNotInROB(tid); @@ -893,6 +899,7 @@ DefaultFetch::checkSignalsAndUpdate(unsigned tid) // In any case, squash. squash(fromCommit->commitInfo[tid].nextPC, fromCommit->commitInfo[tid].nextNPC, + fromCommit->commitInfo[tid].nextMicroPC, fromCommit->commitInfo[tid].doneSeqNum, tid); @@ -948,6 +955,7 @@ DefaultFetch::checkSignalsAndUpdate(unsigned tid) // Squash unless we're already squashing squashFromDecode(fromDecode->decodeInfo[tid].nextPC, fromDecode->decodeInfo[tid].nextNPC, + fromDecode->decodeInfo[tid].nextMicroPC, fromDecode->decodeInfo[tid].doneSeqNum, tid); @@ -1002,9 +1010,9 @@ DefaultFetch::fetch(bool &status_change) DPRINTF(Fetch, "Attempting to fetch from [tid:%i]\n", tid); // The current PC. - Addr &fetch_PC = PC[tid]; - - Addr &fetch_NPC = nextPC[tid]; + Addr fetch_PC = PC[tid]; + Addr fetch_NPC = nextPC[tid]; + Addr fetch_MicroPC = microPC[tid]; // Fault code for memory access. Fault fault = NoFault; @@ -1063,6 +1071,7 @@ DefaultFetch::fetch(bool &status_change) Addr next_PC = fetch_PC; Addr next_NPC = fetch_NPC; + Addr next_MicroPC = fetch_MicroPC; InstSeqNum inst_seq; MachInst inst; @@ -1070,6 +1079,9 @@ DefaultFetch::fetch(bool &status_change) // @todo: Fix this hack. unsigned offset = (fetch_PC & cacheBlkMask) & ~3; + StaticInstPtr staticInst = NULL; + StaticInstPtr macroop = NULL; + if (fault == NoFault) { // If the read of the first instruction was successful, then grab the // instructions from the rest of the cache line and put them into the @@ -1104,19 +1116,29 @@ DefaultFetch::fetch(bool &status_change) // Make sure this is a valid index. assert(offset <= cacheBlkSize - instSize); - // Get the instruction from the array of the cache line. - inst = TheISA::gtoh(*reinterpret_cast - (&cacheData[tid][offset])); + if (!macroop) { + // Get the instruction from the array of the cache line. + inst = TheISA::gtoh(*reinterpret_cast + (&cacheData[tid][offset])); - predecoder.setTC(cpu->thread[tid]->getTC()); - predecoder.moreBytes(fetch_PC, 0, inst); + predecoder.setTC(cpu->thread[tid]->getTC()); + predecoder.moreBytes(fetch_PC, 0, inst); - ext_inst = predecoder.getExtMachInst(); + ext_inst = predecoder.getExtMachInst(); + staticInst = StaticInstPtr(ext_inst); + if (staticInst->isMacroOp()) + macroop = staticInst; + } + if (macroop) { + staticInst = macroop->fetchMicroOp(fetch_MicroPC); + if (staticInst->isLastMicroOp()) + macroop = NULL; + } // Create a new DynInst from the instruction fetched. - DynInstPtr instruction = new DynInst(ext_inst, - fetch_PC, fetch_NPC, - next_PC, next_NPC, + DynInstPtr instruction = new DynInst(staticInst, + fetch_PC, fetch_NPC, fetch_MicroPC, + next_PC, next_NPC, next_MicroPC, inst_seq, cpu); instruction->setTid(tid); @@ -1139,7 +1161,7 @@ DefaultFetch::fetch(bool &status_change) instruction->readPC()); ///FIXME This needs to be more robust in dealing with delay slots - lookupAndUpdateNextPC(instruction, next_PC, next_NPC); + lookupAndUpdateNextPC(instruction, next_PC, next_NPC, next_MicroPC); predicted_branch |= (next_PC != fetch_NPC); // Add instruction to the CPU's list of instructions. @@ -1157,6 +1179,7 @@ DefaultFetch::fetch(bool &status_change) // Move to the next instruction, unless we have a branch. fetch_PC = next_PC; fetch_NPC = next_NPC; + fetch_MicroPC = next_MicroPC; if (instruction->isQuiesce()) { DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!", @@ -1167,7 +1190,8 @@ DefaultFetch::fetch(bool &status_change) break; } - offset += instSize; + if (!macroop) + offset += instSize; } if (offset >= cacheBlkSize) { @@ -1191,7 +1215,7 @@ DefaultFetch::fetch(bool &status_change) if (fault == NoFault) { PC[tid] = next_PC; nextPC[tid] = next_NPC; - nextNPC[tid] = next_NPC + instSize; + microPC[tid] = next_MicroPC; DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, next_PC); } else { // We shouldn't be in an icache miss and also have a fault (an ITB @@ -1210,8 +1234,9 @@ DefaultFetch::fetch(bool &status_change) // We will use a nop in order to carry the fault. ext_inst = TheISA::NoopMachInst; + StaticInstPtr staticInst = new StaticInst(ext_inst); // Create a new DynInst from the dummy nop. - DynInstPtr instruction = new DynInst(ext_inst, + DynInstPtr instruction = new DynInst(staticInst, fetch_PC, fetch_NPC, next_PC, next_NPC, inst_seq, cpu); diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 050785818..399c44909 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -454,6 +454,7 @@ DefaultIEW::squashDueToBranch(DynInstPtr &inst, unsigned tid) #endif toCommit->nextPC[tid] = inst->readNextPC(); toCommit->nextNPC[tid] = inst->readNextNPC(); + toCommit->nextMicroPC[tid] = inst->readNextMicroPC(); toCommit->includeSquashInst[tid] = false; diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index 6e7180b1e..d78de2c87 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -963,6 +963,7 @@ DefaultRename::renameSrcRegs(DynInstPtr &inst,unsigned tid) // Floating point and Miscellaneous registers need their indexes // adjusted to account for the expanded number of flattened int regs. flat_src_reg = src_reg - TheISA::FP_Base_DepTag + TheISA::NumIntRegs; + DPRINTF(Rename, "Adjusting reg index from %d to %d.\n", src_reg, flat_src_reg); } inst->flattenSrcReg(src_idx, flat_src_reg); @@ -979,9 +980,11 @@ DefaultRename::renameSrcRegs(DynInstPtr &inst,unsigned tid) // See if the register is ready or not. if (scoreboard->getReg(renamed_reg) == true) { - DPRINTF(Rename, "[tid:%u]: Register is ready.\n", tid); + DPRINTF(Rename, "[tid:%u]: Register %d is ready.\n", tid, renamed_reg); inst->markSrcRegReady(src_idx); + } else { + DPRINTF(Rename, "[tid:%u]: Register %d is not ready.\n", tid, renamed_reg); } ++renameRenameLookups; @@ -1008,6 +1011,7 @@ DefaultRename::renameDestRegs(DynInstPtr &inst,unsigned tid) // Floating point and Miscellaneous registers need their indexes // adjusted to account for the expanded number of flattened int regs. flat_dest_reg = dest_reg - TheISA::FP_Base_DepTag + TheISA::NumIntRegs; + DPRINTF(Rename, "Adjusting reg index from %d to %d.\n", dest_reg, flat_dest_reg); } inst->flattenDestReg(dest_idx, flat_dest_reg); diff --git a/src/cpu/o3/sparc/dyn_inst.hh b/src/cpu/o3/sparc/dyn_inst.hh index 72242b161..a7ab6cd79 100644 --- a/src/cpu/o3/sparc/dyn_inst.hh +++ b/src/cpu/o3/sparc/dyn_inst.hh @@ -56,8 +56,14 @@ class SparcDynInst : public BaseDynInst public: /** BaseDynInst constructor given a binary instruction. */ - SparcDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, - Addr Pred_PC, Addr Pred_NPC, InstSeqNum seq_num, O3CPU *cpu); + SparcDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu); + + /** BaseDynInst constructor given a binary instruction. */ + SparcDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu); /** BaseDynInst constructor given a static inst pointer. */ SparcDynInst(StaticInstPtr &_staticInst); diff --git a/src/cpu/o3/sparc/dyn_inst_impl.hh b/src/cpu/o3/sparc/dyn_inst_impl.hh index c4d30b6f4..6bfe97717 100644 --- a/src/cpu/o3/sparc/dyn_inst_impl.hh +++ b/src/cpu/o3/sparc/dyn_inst_impl.hh @@ -31,10 +31,23 @@ #include "cpu/o3/sparc/dyn_inst.hh" template -SparcDynInst::SparcDynInst(TheISA::ExtMachInst inst, - Addr PC, Addr NPC, Addr Pred_PC, Addr Pred_NPC, +SparcDynInst::SparcDynInst(StaticInstPtr staticInst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu) - : BaseDynInst(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu) + : BaseDynInst(staticInst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) +{ + initVars(); +} + +template +SparcDynInst::SparcDynInst(TheISA::ExtMachInst inst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu) + : BaseDynInst(inst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) { initVars(); } From 308b2f0ce3215eaaed69da937555008f9ed36835 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Sun, 15 Apr 2007 21:51:05 +0000 Subject: [PATCH 6/7] Add extra constructors to Alpha and MIPS --HG-- extra : convert_revision : 26ea87bfe9e5c27134eb9a15bf9e4629afae6c69 --- src/cpu/o3/alpha/dyn_inst.hh | 9 +++++++-- src/cpu/o3/alpha/dyn_inst_impl.hh | 19 +++++++++++++++++-- src/cpu/o3/mips/dyn_inst.hh | 10 ++++++++-- src/cpu/o3/mips/dyn_inst_impl.hh | 20 ++++++++++++++++---- 4 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/cpu/o3/alpha/dyn_inst.hh b/src/cpu/o3/alpha/dyn_inst.hh index 20759d849..a6fb7b885 100644 --- a/src/cpu/o3/alpha/dyn_inst.hh +++ b/src/cpu/o3/alpha/dyn_inst.hh @@ -73,8 +73,13 @@ class AlphaDynInst : public BaseDynInst public: /** BaseDynInst constructor given a binary instruction. */ - AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC, - Addr Pred_PC, Addr Pred_NPC, + AlphaDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu); + + /** BaseDynInst constructor given a binary instruction. */ + AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu); /** BaseDynInst constructor given a static inst pointer. */ diff --git a/src/cpu/o3/alpha/dyn_inst_impl.hh b/src/cpu/o3/alpha/dyn_inst_impl.hh index fdce1ade5..6dfe0ccdd 100644 --- a/src/cpu/o3/alpha/dyn_inst_impl.hh +++ b/src/cpu/o3/alpha/dyn_inst_impl.hh @@ -31,10 +31,25 @@ #include "cpu/o3/alpha/dyn_inst.hh" template -AlphaDynInst::AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC, +AlphaDynInst::AlphaDynInst(StaticInstPtr staticInst, + Addr PC, Addr NPC, Addr microPC, Addr Pred_PC, Addr Pred_NPC, + Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu) - : BaseDynInst(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu) + : BaseDynInst(staticInst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) +{ + initVars(); +} + +template +AlphaDynInst::AlphaDynInst(ExtMachInst inst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, + Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu) + : BaseDynInst(inst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) { initVars(); } diff --git a/src/cpu/o3/mips/dyn_inst.hh b/src/cpu/o3/mips/dyn_inst.hh index 366b4bb23..cf78c0941 100755 --- a/src/cpu/o3/mips/dyn_inst.hh +++ b/src/cpu/o3/mips/dyn_inst.hh @@ -69,10 +69,16 @@ class MipsDynInst : public BaseDynInst }; public: + /** BaseDynInst constructor given a binary instruction. */ + MipsDynInst(StaticInstPtr staticInst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu); + /** BaseDynInst constructor given a binary instruction. */ MipsDynInst(ExtMachInst inst, - Addr PC, Addr NPC, - Addr Pred_PC, Addr Pred_NPC, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu); /** BaseDynInst constructor given a static inst pointer. */ diff --git a/src/cpu/o3/mips/dyn_inst_impl.hh b/src/cpu/o3/mips/dyn_inst_impl.hh index c0f9ae771..7e8697b32 100755 --- a/src/cpu/o3/mips/dyn_inst_impl.hh +++ b/src/cpu/o3/mips/dyn_inst_impl.hh @@ -31,11 +31,23 @@ #include "cpu/o3/mips/dyn_inst.hh" template -MipsDynInst::MipsDynInst(ExtMachInst inst, - Addr PC, Addr NPC, - Addr Pred_PC, Addr Pred_NPC, +MipsDynInst::MipsDynInst(StaticInstPtr staticInst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, InstSeqNum seq_num, O3CPU *cpu) - : BaseDynInst(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu) + : BaseDynInst(staticInst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) +{ + initVars(); +} + +template +MipsDynInst::MipsDynInst(ExtMachInst inst, + Addr PC, Addr NPC, Addr microPC, + Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC, + InstSeqNum seq_num, O3CPU *cpu) + : BaseDynInst(inst, PC, NPC, microPC, + Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu) { initVars(); } From 8248af53b19a633ae6d9aa8cd6b5a12cfa3b1644 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Sun, 15 Apr 2007 21:52:38 +0000 Subject: [PATCH 7/7] Make an inner loop which pulls microops out of macroops. These aren't checked for control flow because we can pull out microops until we run out of buffer. This prevents microops from being interpretted as branches because the pc doesn't become npc. --HG-- extra : convert_revision : 9fff7c6c32900692bbc567ecb75701c9c73da259 --- src/cpu/o3/fetch_impl.hh | 127 ++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 63 deletions(-) diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index d1f38e38b..3ae7bc402 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -1094,11 +1094,9 @@ DefaultFetch::fetch(bool &status_change) // ended this fetch block. bool predicted_branch = false; - for (; - offset < cacheBlkSize && - numInst < fetchWidth && - !predicted_branch; - ++numInst) { + while (offset < cacheBlkSize && + numInst < fetchWidth && + !predicted_branch) { // If we're branching after this instruction, quite fetching // from the same block then. @@ -1109,10 +1107,6 @@ DefaultFetch::fetch(bool &status_change) fetch_PC, fetch_NPC); } - - // Get a sequence number. - inst_seq = cpu->getAndIncrementInstSeq(); - // Make sure this is a valid index. assert(offset <= cacheBlkSize - instSize); @@ -1129,80 +1123,87 @@ DefaultFetch::fetch(bool &status_change) if (staticInst->isMacroOp()) macroop = staticInst; } - if (macroop) { - staticInst = macroop->fetchMicroOp(fetch_MicroPC); - if (staticInst->isLastMicroOp()) - macroop = NULL; - } + do { + if (macroop) { + staticInst = macroop->fetchMicroOp(fetch_MicroPC); + if (staticInst->isLastMicroOp()) + macroop = NULL; + } - // Create a new DynInst from the instruction fetched. - DynInstPtr instruction = new DynInst(staticInst, - fetch_PC, fetch_NPC, fetch_MicroPC, - next_PC, next_NPC, next_MicroPC, - inst_seq, cpu); - instruction->setTid(tid); + // Get a sequence number. + inst_seq = cpu->getAndIncrementInstSeq(); - instruction->setASID(tid); + // Create a new DynInst from the instruction fetched. + DynInstPtr instruction = new DynInst(staticInst, + fetch_PC, fetch_NPC, fetch_MicroPC, + next_PC, next_NPC, next_MicroPC, + inst_seq, cpu); + instruction->setTid(tid); - instruction->setThreadState(cpu->thread[tid]); + instruction->setASID(tid); - DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created " - "[sn:%lli]\n", - tid, instruction->readPC(), inst_seq); + instruction->setThreadState(cpu->thread[tid]); - //DPRINTF(Fetch, "[tid:%i]: MachInst is %#x\n", tid, ext_inst); + DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created " + "[sn:%lli]\n", + tid, instruction->readPC(), inst_seq); - DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n", - tid, instruction->staticInst->disassemble(fetch_PC)); + //DPRINTF(Fetch, "[tid:%i]: MachInst is %#x\n", tid, ext_inst); - instruction->traceData = - Trace::getInstRecord(curTick, cpu->tcBase(tid), - instruction->staticInst, - instruction->readPC()); + DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n", + tid, instruction->staticInst->disassemble(fetch_PC)); - ///FIXME This needs to be more robust in dealing with delay slots - lookupAndUpdateNextPC(instruction, next_PC, next_NPC, next_MicroPC); - predicted_branch |= (next_PC != fetch_NPC); + instruction->traceData = + Trace::getInstRecord(curTick, cpu->tcBase(tid), + instruction->staticInst, + instruction->readPC()); - // Add instruction to the CPU's list of instructions. - instruction->setInstListIt(cpu->addInst(instruction)); + ///FIXME This needs to be more robust in dealing with delay slots + predicted_branch |= + lookupAndUpdateNextPC(instruction, next_PC, next_NPC, next_MicroPC); - // Write the instruction to the first slot in the queue - // that heads to decode. - toDecode->insts[numInst] = instruction; + // Add instruction to the CPU's list of instructions. + instruction->setInstListIt(cpu->addInst(instruction)); - toDecode->size++; + // Write the instruction to the first slot in the queue + // that heads to decode. + toDecode->insts[numInst] = instruction; - // Increment stat of fetched instructions. - ++fetchedInsts; + toDecode->size++; - // Move to the next instruction, unless we have a branch. - fetch_PC = next_PC; - fetch_NPC = next_NPC; - fetch_MicroPC = next_MicroPC; + // Increment stat of fetched instructions. + ++fetchedInsts; + + // Move to the next instruction, unless we have a branch. + fetch_PC = next_PC; + fetch_NPC = next_NPC; + fetch_MicroPC = next_MicroPC; + + if (instruction->isQuiesce()) { + DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!", + curTick); + fetchStatus[tid] = QuiescePending; + ++numInst; + status_change = true; + break; + } - if (instruction->isQuiesce()) { - DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!", - curTick); - fetchStatus[tid] = QuiescePending; ++numInst; - status_change = true; - break; - } - - if (!macroop) - offset += instSize; + } while (staticInst->isMicroOp() && + !staticInst->isLastMicroOp() && + numInst < fetchWidth); + offset += instSize; } - if (offset >= cacheBlkSize) { - DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " - "block.\n", tid); + if (predicted_branch) { + DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch " + "instruction encountered.\n", tid); } else if (numInst >= fetchWidth) { DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth " "for this cycle.\n", tid); - } else if (predicted_branch) { - DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch " - "instruction encountered.\n", tid); + } else if (offset >= cacheBlkSize) { + DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " + "block.\n", tid); } }