Merge zizzer.eecs.umich.edu:/z/m5/Bitkeeper/newmem

into zizzer.eecs.umich.edu:/.automount/wexford/x/gblack/m5/newmem-o3-spec --HG-- extra : convert_revision : d18cce378fe3390c6e708945b9ea7c76c2d20a81
2007-04-17 08:56:59 -04:00 · 2007-04-17 08:56:59 -04:00 · dde2b11ae6
commit dde2b11ae6
parent 68221b708c 8248af53b1
25 changed files with 560 additions and 605 deletions
--- a/src/arch/sparc/isa/operands.isa
+++ b/src/arch/sparc/isa/operands.isa
@ -187,7 +187,7 @@ def operands {{
    'Hver':		('ControlReg', 'udw', 'MISCREG_HVER', None, 74),
    'StrandStsReg':	('ControlReg', 'udw', 'MISCREG_STRAND_STS_REG', None, 75),

-    'Fsr':		('ControlReg', 'udw', 'MISCREG_FSR', None, 80),
+    'Fsr':		('ControlReg', 'udw', 'MISCREG_FSR', (None, None, ['IsSerializeAfter','IsSerializing','IsNonSpeculative']), 80),
    # Mem gets a large number so it's always last
    'Mem': 		('Mem', 'udw', None, ('IsMemRef', 'IsLoad', 'IsStore'), 100)

--- a/src/arch/sparc/types.hh
+++ b/src/arch/sparc/types.hh
@ -59,7 +59,7 @@ namespace SparcISA

    typedef int RegContextVal;

-    typedef uint8_t RegIndex;
+    typedef uint16_t RegIndex;
 }

 #endif
--- a/src/cpu/base_dyn_inst.hh
+++ b/src/cpu/base_dyn_inst.hh
@ -209,6 +209,9 @@ class BaseDynInst : public FastAlloc, public RefCounted
    /** PC of this instruction. */
    Addr PC;

+    /** Micro PC of this instruction. */
+    Addr microPC;
+
  protected:
    /** Next non-speculative PC.  It is not filled in at fetch, but rather
     *  once the target of the branch is truly known (either decode or
@ -219,12 +222,18 @@ class BaseDynInst : public FastAlloc, public RefCounted
    /** Next non-speculative NPC. Target PC for Mips or Sparc. */
    Addr nextNPC;

+    /** Next non-speculative micro PC. */
+    Addr nextMicroPC;
+
    /** Predicted next PC. */
    Addr predPC;

    /** Predicted next NPC. */
    Addr predNPC;

+    /** Predicted next microPC */
+    Addr predMicroPC;
+
    /** If this is a branch that was predicted taken */
    bool predTaken;

@ -340,6 +349,17 @@ class BaseDynInst : public FastAlloc, public RefCounted
    {
        _flatDestRegIdx[idx] = flattened_dest;
    }
+    /** BaseDynInst constructor given a StaticInstPtr.
+     *  @param staticInst A StaticInstPtr to the underlying instruction.
+     *  @param PC The PC of the instruction.
+     *  @param pred_PC The predicted next PC.
+     *  @param pred_NPC The predicted next NPC.
+     *  @param seq_num The sequence number of the instruction.
+     *  @param cpu Pointer to the instruction's CPU.
+     */
+    BaseDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC,
+            Addr pred_PC, Addr pred_NPC, Addr pred_MicroPC,
+            InstSeqNum seq_num, ImplCPU *cpu);

    /** BaseDynInst constructor given a binary instruction.
     *  @param inst The binary instruction.
@ -349,8 +369,8 @@ class BaseDynInst : public FastAlloc, public RefCounted
     *  @param seq_num The sequence number of the instruction.
     *  @param cpu Pointer to the instruction's CPU.
     */
-    BaseDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC,
-            Addr pred_PC, Addr pred_NPC,
+    BaseDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, Addr microPC,
+            Addr pred_PC, Addr pred_NPC, Addr pred_MicroPC,
            InstSeqNum seq_num, ImplCPU *cpu);

    /** BaseDynInst constructor given a StaticInst pointer.
@ -402,11 +422,18 @@ class BaseDynInst : public FastAlloc, public RefCounted
 #endif
    }

+    Addr readNextMicroPC()
+    {
+        return nextMicroPC;
+    }
+
    /** Set the predicted target of this current instruction. */
-    void setPredTarg(Addr predicted_PC, Addr predicted_NPC)
+    void setPredTarg(Addr predicted_PC, Addr predicted_NPC,
+            Addr predicted_MicroPC)
    {
        predPC = predicted_PC;
        predNPC = predicted_NPC;
+        predMicroPC = predicted_MicroPC;
    }

    /** Returns the predicted PC immediately after the branch. */
@ -415,6 +442,9 @@ class BaseDynInst : public FastAlloc, public RefCounted
    /** Returns the predicted PC two instructions after the branch */
    Addr readPredNPC() { return predNPC; }

+    /** Returns the predicted micro PC after the branch */
+    Addr readPredMicroPC() { return predMicroPC; }
+
    /** Returns whether the instruction was predicted taken or not. */
    bool readPredTaken()
    {
@ -430,7 +460,8 @@ class BaseDynInst : public FastAlloc, public RefCounted
    bool mispredicted()
    {
        return readPredPC() != readNextPC() ||
-            readPredNPC() != readNextNPC();
+            readPredNPC() != readNextNPC() ||
+            readPredMicroPC() != readNextMicroPC();
    }

    //
@ -467,6 +498,12 @@ class BaseDynInst : public FastAlloc, public RefCounted
    bool isQuiesce() const { return staticInst->isQuiesce(); }
    bool isIprAccess() const { return staticInst->isIprAccess(); }
    bool isUnverifiable() const { return staticInst->isUnverifiable(); }
+    bool isMacroOp() const { return staticInst->isMacroOp(); }
+    bool isMicroOp() const { return staticInst->isMicroOp(); }
+    bool isDelayedCommit() const { return staticInst->isDelayedCommit(); }
+    bool isLastMicroOp() const { return staticInst->isLastMicroOp(); }
+    bool isFirstMicroOp() const { return staticInst->isFirstMicroOp(); }
+    bool isMicroBranch() const { return staticInst->isMicroBranch(); }

    /** Temporarily sets this instruction as a serialize before instruction. */
    void setSerializeBefore() { status.set(SerializeBefore); }
@ -700,16 +737,26 @@ class BaseDynInst : public FastAlloc, public RefCounted
    /** Read the PC of this instruction. */
    const Addr readPC() const { return PC; }

+    /** Read the micro PC of this instruction. */
+    const Addr readMicroPC() const { return microPC; }
+
    /** Set the next PC of this instruction (its actual target). */
-    void setNextPC(uint64_t val)
+    void setNextPC(Addr val)
    {
        nextPC = val;
    }

    /** Set the next NPC of this instruction (the target in Mips or Sparc).*/
-    void setNextNPC(uint64_t val)
+    void setNextNPC(Addr val)
    {
+#if ISA_HAS_DELAY_SLOT
        nextNPC = val;
+#endif
+    }
+
+    void setNextMicroPC(Addr val)
+    {
+        nextMicroPC = val;
    }

    /** Sets the ASID. */
--- a/src/cpu/base_dyn_inst_impl.hh
+++ b/src/cpu/base_dyn_inst_impl.hh
@ -62,19 +62,66 @@ my_hash_t thishash;
 #endif

 template <class Impl>
-BaseDynInst<Impl>::BaseDynInst(TheISA::ExtMachInst machInst,
+BaseDynInst<Impl>::BaseDynInst(StaticInstPtr _staticInst,
                               Addr inst_PC, Addr inst_NPC,
+                               Addr inst_MicroPC,
                               Addr pred_PC, Addr pred_NPC,
+                               Addr pred_MicroPC,
                               InstSeqNum seq_num, ImplCPU *cpu)
-  : staticInst(machInst), traceData(NULL), cpu(cpu)
+  : staticInst(_staticInst), traceData(NULL), cpu(cpu)
 {
    seqNum = seq_num;

+    bool nextIsMicro =
+        staticInst->isMicroOp() && !staticInst->isLastMicroOp();
+
    PC = inst_PC;
-    nextPC = inst_NPC;
-    nextNPC = nextPC + sizeof(TheISA::MachInst);
+    microPC = inst_MicroPC;
+    if (nextIsMicro) {
+        nextPC = inst_PC;
+        nextNPC = inst_NPC;
+        nextMicroPC = microPC + 1;
+    } else {
+        nextPC = inst_NPC;
+        nextNPC = nextPC + sizeof(TheISA::MachInst);
+        nextMicroPC = 0;
+    }
    predPC = pred_PC;
    predNPC = pred_NPC;
+    predMicroPC = pred_MicroPC;
+    predTaken = false;
+
+    initVars();
+}
+
+template <class Impl>
+BaseDynInst<Impl>::BaseDynInst(TheISA::ExtMachInst inst,
+                               Addr inst_PC, Addr inst_NPC,
+                               Addr inst_MicroPC,
+                               Addr pred_PC, Addr pred_NPC,
+                               Addr pred_MicroPC,
+                               InstSeqNum seq_num, ImplCPU *cpu)
+  : staticInst(inst), traceData(NULL), cpu(cpu)
+{
+    seqNum = seq_num;
+
+    bool nextIsMicro =
+        staticInst->isMicroOp() && !staticInst->isLastMicroOp();
+
+    PC = inst_PC;
+    microPC = inst_MicroPC;
+    if (nextIsMicro) {
+        nextPC = inst_PC;
+        nextNPC = inst_NPC;
+        nextMicroPC = microPC + 1;
+    } else {
+        nextPC = inst_NPC;
+        nextNPC = nextPC + sizeof(TheISA::MachInst);
+        nextMicroPC = 0;
+    }
+    predPC = pred_PC;
+    predNPC = pred_NPC;
+    predMicroPC = pred_MicroPC;
    predTaken = false;

    initVars();
--- a/src/cpu/o3/alpha/dyn_inst.hh
+++ b/src/cpu/o3/alpha/dyn_inst.hh
@ -73,8 +73,13 @@ class AlphaDynInst : public BaseDynInst<Impl>

  public:
    /** BaseDynInst constructor given a binary instruction. */
-    AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC,
-                 Addr Pred_PC, Addr Pred_NPC,
+    AlphaDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC,
+                 Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
+                 InstSeqNum seq_num, O3CPU *cpu);
+
+    /** BaseDynInst constructor given a binary instruction. */
+    AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC, Addr microPC,
+                 Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
                 InstSeqNum seq_num, O3CPU *cpu);

    /** BaseDynInst constructor given a static inst pointer. */
--- a/src/cpu/o3/alpha/dyn_inst_impl.hh
+++ b/src/cpu/o3/alpha/dyn_inst_impl.hh
@ -31,10 +31,25 @@
 #include "cpu/o3/alpha/dyn_inst.hh"

 template <class Impl>
-AlphaDynInst<Impl>::AlphaDynInst(ExtMachInst inst, Addr PC, Addr NPC,
+AlphaDynInst<Impl>::AlphaDynInst(StaticInstPtr staticInst,
+                                 Addr PC, Addr NPC, Addr microPC,
                                 Addr Pred_PC, Addr Pred_NPC,
+                                 Addr Pred_MicroPC,
                                 InstSeqNum seq_num, O3CPU *cpu)
-    : BaseDynInst<Impl>(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu)
+    : BaseDynInst<Impl>(staticInst, PC, NPC, microPC,
+            Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu)
+{
+    initVars();
+}
+
+template <class Impl>
+AlphaDynInst<Impl>::AlphaDynInst(ExtMachInst inst,
+                                 Addr PC, Addr NPC, Addr microPC,
+                                 Addr Pred_PC, Addr Pred_NPC,
+                                 Addr Pred_MicroPC,
+                                 InstSeqNum seq_num, O3CPU *cpu)
+    : BaseDynInst<Impl>(inst, PC, NPC, microPC,
+            Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu)
 {
    initVars();
 }
--- a/src/cpu/o3/comm.hh
+++ b/src/cpu/o3/comm.hh
@ -87,10 +87,10 @@ struct DefaultIEWDefaultCommit {
    bool squash[Impl::MaxThreads];
    bool branchMispredict[Impl::MaxThreads];
    bool branchTaken[Impl::MaxThreads];
-    bool squashDelaySlot[Impl::MaxThreads];
-    uint64_t mispredPC[Impl::MaxThreads];
-    uint64_t nextPC[Impl::MaxThreads];
-    uint64_t nextNPC[Impl::MaxThreads];
+    Addr mispredPC[Impl::MaxThreads];
+    Addr nextPC[Impl::MaxThreads];
+    Addr nextNPC[Impl::MaxThreads];
+    Addr nextMicroPC[Impl::MaxThreads];
    InstSeqNum squashedSeqNum[Impl::MaxThreads];

    bool includeSquashInst[Impl::MaxThreads];
@ -114,15 +114,15 @@ struct TimeBufStruct {
        uint64_t branchAddr;

        InstSeqNum doneSeqNum;
-        InstSeqNum bdelayDoneSeqNum;

        // @todo: Might want to package this kind of branch stuff into a single
        // struct as it is used pretty frequently.
        bool branchMispredict;
        bool branchTaken;
-        uint64_t mispredPC;
-        uint64_t nextPC;
-        uint64_t nextNPC;
+        Addr mispredPC;
+        Addr nextPC;
+        Addr nextNPC;
+        Addr nextMicroPC;

        unsigned branchCount;
    };
@ -160,18 +160,16 @@ struct TimeBufStruct {

        bool branchMispredict;
        bool branchTaken;
-        uint64_t mispredPC;
-        uint64_t nextPC;
-        uint64_t nextNPC;
+        Addr mispredPC;
+        Addr nextPC;
+        Addr nextNPC;
+        Addr nextMicroPC;

        // Represents the instruction that has either been retired or
        // squashed.  Similar to having a single bus that broadcasts the
        // retired or squashed sequence number.
        InstSeqNum doneSeqNum;

-        InstSeqNum bdelayDoneSeqNum;
-        bool squashDelaySlot;
-
        //Just in case we want to do a commit/squash on a cycle
        //(necessary for multiple ROBs?)
        bool commitInsts;
--- a/src/cpu/o3/commit.hh
+++ b/src/cpu/o3/commit.hh
@ -279,25 +279,37 @@ class DefaultCommit
    /** Returns the PC of the head instruction of the ROB.
     * @todo: Probably remove this function as it returns only thread 0.
     */
-    uint64_t readPC() { return PC[0]; }
+    Addr readPC() { return PC[0]; }

    /** Returns the PC of a specific thread. */
-    uint64_t readPC(unsigned tid) { return PC[tid]; }
+    Addr readPC(unsigned tid) { return PC[tid]; }

    /** Sets the PC of a specific thread. */
-    void setPC(uint64_t val, unsigned tid) { PC[tid] = val; }
+    void setPC(Addr val, unsigned tid) { PC[tid] = val; }
+
+    /** Reads the micro PC of a specific thread. */
+    Addr readMicroPC(unsigned tid) { return microPC[tid]; }
+
+    /** Sets the micro PC of a specific thread */
+    void setMicroPC(Addr val, unsigned tid) { microPC[tid] = val; }

    /** Reads the next PC of a specific thread. */
-    uint64_t readNextPC(unsigned tid) { return nextPC[tid]; }
+    Addr readNextPC(unsigned tid) { return nextPC[tid]; }

    /** Sets the next PC of a specific thread. */
-    void setNextPC(uint64_t val, unsigned tid) { nextPC[tid] = val; }
+    void setNextPC(Addr val, unsigned tid) { nextPC[tid] = val; }

    /** Reads the next NPC of a specific thread. */
-    uint64_t readNextNPC(unsigned tid) { return nextNPC[tid]; }
+    Addr readNextNPC(unsigned tid) { return nextNPC[tid]; }

    /** Sets the next NPC of a specific thread. */
-    void setNextNPC(uint64_t val, unsigned tid) { nextNPC[tid] = val; }
+    void setNextNPC(Addr val, unsigned tid) { nextNPC[tid] = val; }
+
+    /** Reads the next micro PC of a specific thread. */
+    Addr readNextMicroPC(unsigned tid) { return nextMicroPC[tid]; }
+
+    /** Sets the next micro PC of a specific thread. */
+    void setNextMicroPC(Addr val, unsigned tid) { nextMicroPC[tid] = val; }

  private:
    /** Time buffer interface. */
@ -402,12 +414,20 @@ class DefaultCommit
     */
    Addr PC[Impl::MaxThreads];

+    /** The commit micro PC of each thread.  Refers to the instruction that
+     * is currently being processed/committed.
+     */
+    Addr microPC[Impl::MaxThreads];
+
    /** The next PC of each thread. */
    Addr nextPC[Impl::MaxThreads];

    /** The next NPC of each thread. */
    Addr nextNPC[Impl::MaxThreads];

+    /** The next micro PC of each thread. */
+    Addr nextMicroPC[Impl::MaxThreads];
+
    /** The sequence number of the youngest valid instruction in the ROB. */
    InstSeqNum youngestSeqNum[Impl::MaxThreads];

--- a/src/cpu/o3/commit_impl.hh
+++ b/src/cpu/o3/commit_impl.hh
@ -124,7 +124,7 @@ DefaultCommit<Impl>::DefaultCommit(O3CPU *_cpu, Params *params)
        committedStores[i] = false;
        trapSquash[i] = false;
        tcSquash[i] = false;
-        PC[i] = nextPC[i] = nextNPC[i] = 0;
+        microPC[i] = nextMicroPC[i] = PC[i] = nextPC[i] = nextNPC[i] = 0;
    }
 #if FULL_SYSTEM
    interrupt = NoFault;
@ -508,6 +508,7 @@ DefaultCommit<Impl>::squashAll(unsigned tid)

    toIEW->commitInfo[tid].nextPC = PC[tid];
    toIEW->commitInfo[tid].nextNPC = nextPC[tid];
+    toIEW->commitInfo[tid].nextMicroPC = nextMicroPC[tid];
 }

 template <class Impl>
@ -741,38 +742,15 @@ DefaultCommit<Impl>::commit()
            // then use one older sequence number.
            InstSeqNum squashed_inst = fromIEW->squashedSeqNum[tid];

-#if ISA_HAS_DELAY_SLOT
-            InstSeqNum bdelay_done_seq_num = squashed_inst;
-            bool squash_bdelay_slot = fromIEW->squashDelaySlot[tid];
-            bool branchMispredict = fromIEW->branchMispredict[tid];
-
-            // Squashing/not squashing the branch delay slot only makes
-            // sense when you're squashing from a branch, ie from a branch
-            // mispredict.
-            if (branchMispredict && !squash_bdelay_slot) {
-                bdelay_done_seq_num++;
-            }
-#endif
-
            if (fromIEW->includeSquashInst[tid] == true) {
                squashed_inst--;
-#if ISA_HAS_DELAY_SLOT
-                bdelay_done_seq_num--;
-#endif
            }

            // All younger instructions will be squashed. Set the sequence
            // number as the youngest instruction in the ROB.
            youngestSeqNum[tid] = squashed_inst;

-#if ISA_HAS_DELAY_SLOT
-            rob->squash(bdelay_done_seq_num, tid);
-            toIEW->commitInfo[tid].squashDelaySlot = squash_bdelay_slot;
-            toIEW->commitInfo[tid].bdelayDoneSeqNum = bdelay_done_seq_num;
-#else
            rob->squash(squashed_inst, tid);
-            toIEW->commitInfo[tid].squashDelaySlot = true;
-#endif
            changedROBNumEntries[tid] = true;

            toIEW->commitInfo[tid].doneSeqNum = squashed_inst;
@ -791,6 +769,7 @@ DefaultCommit<Impl>::commit()

            toIEW->commitInfo[tid].nextPC = fromIEW->nextPC[tid];
            toIEW->commitInfo[tid].nextNPC = fromIEW->nextNPC[tid];
+            toIEW->commitInfo[tid].nextMicroPC = fromIEW->nextMicroPC[tid];

            toIEW->commitInfo[tid].mispredPC = fromIEW->mispredPC[tid];

@ -809,10 +788,6 @@ DefaultCommit<Impl>::commit()

        // Try to commit any instructions.
        commitInsts();
-    } else {
-#if ISA_HAS_DELAY_SLOT
-        skidInsert();
-#endif
    }

    //Check for any activity
@ -904,6 +879,7 @@ DefaultCommit<Impl>::commitInsts()
            PC[tid] = head_inst->readPC();
            nextPC[tid] = head_inst->readNextPC();
            nextNPC[tid] = head_inst->readNextNPC();
+            nextMicroPC[tid] = head_inst->readNextMicroPC();

            // Increment the total number of non-speculative instructions
            // executed.
@ -932,12 +908,10 @@ DefaultCommit<Impl>::commitInsts()
                }

                PC[tid] = nextPC[tid];
-#if ISA_HAS_DELAY_SLOT
                nextPC[tid] = nextNPC[tid];
                nextNPC[tid] = nextNPC[tid] + sizeof(TheISA::MachInst);
-#else
-                nextPC[tid] = nextPC[tid] + sizeof(TheISA::MachInst);
-#endif
+                microPC[tid] = nextMicroPC[tid];
+                nextMicroPC[tid] = microPC[tid] + 1;

 #if FULL_SYSTEM
                int count = 0;
@ -1164,37 +1138,13 @@ DefaultCommit<Impl>::getInsts()
 {
    DPRINTF(Commit, "Getting instructions from Rename stage.\n");

-#if ISA_HAS_DELAY_SLOT
-    // Read any renamed instructions and place them into the ROB.
-    int insts_to_process = std::min((int)renameWidth,
-                               (int)(fromRename->size + skidBuffer.size()));
-    int rename_idx = 0;
-
-    DPRINTF(Commit, "%i insts available to process. Rename Insts:%i "
-            "SkidBuffer Insts:%i\n", insts_to_process, fromRename->size,
-            skidBuffer.size());
-#else
    // Read any renamed instructions and place them into the ROB.
    int insts_to_process = std::min((int)renameWidth, fromRename->size);
-#endif
-

    for (int inst_num = 0; inst_num < insts_to_process; ++inst_num) {
        DynInstPtr inst;

-#if ISA_HAS_DELAY_SLOT
-        // Get insts from skidBuffer or from Rename
-        if (skidBuffer.size() > 0) {
-            DPRINTF(Commit, "Grabbing skidbuffer inst.\n");
-            inst = skidBuffer.front();
-            skidBuffer.pop();
-        } else {
-            DPRINTF(Commit, "Grabbing rename inst.\n");
-            inst = fromRename->insts[rename_idx++];
-        }
-#else
        inst = fromRename->insts[inst_num];
-#endif
        int tid = inst->threadNumber;

        if (!inst->isSquashed() &&
@ -1216,30 +1166,6 @@ DefaultCommit<Impl>::getInsts()
                    inst->readPC(), inst->seqNum, tid);
        }
    }
-
-#if ISA_HAS_DELAY_SLOT
-    if (rename_idx < fromRename->size) {
-        DPRINTF(Commit,"Placing Rename Insts into skidBuffer.\n");
-
-        for (;
-             rename_idx < fromRename->size;
-             rename_idx++) {
-            DynInstPtr inst = fromRename->insts[rename_idx];
-
-            if (!inst->isSquashed()) {
-                DPRINTF(Commit, "Inserting PC %#x [sn:%i] [tid:%i] into ",
-                        "skidBuffer.\n", inst->readPC(), inst->seqNum,
-                        inst->threadNumber);
-                skidBuffer.push(inst);
-            } else {
-                DPRINTF(Commit, "Instruction PC %#x [sn:%i] [tid:%i] was "
-                        "squashed, skipping.\n",
-                        inst->readPC(), inst->seqNum, inst->threadNumber);
-            }
-        }
-    }
-#endif
-
 }

 template <class Impl>
--- a/src/cpu/o3/cpu.cc
+++ b/src/cpu/o3/cpu.cc
@ -696,7 +696,7 @@ FullO3CPU<Impl>::removeThread(unsigned tid)

    // Squash Throughout Pipeline
    InstSeqNum squash_seq_num = commit.rob->readHeadInst(tid)->seqNum;
-    fetch.squash(0, sizeof(TheISA::MachInst), squash_seq_num, true, tid);
+    fetch.squash(0, sizeof(TheISA::MachInst), 0, squash_seq_num, tid);
    decode.squash(tid);
    rename.squash(squash_seq_num, tid);
    iew.squash(tid);
@ -1150,6 +1150,20 @@ FullO3CPU<Impl>::setPC(Addr new_PC,unsigned tid)
    commit.setPC(new_PC, tid);
 }

+template <class Impl>
+uint64_t
+FullO3CPU<Impl>::readMicroPC(unsigned tid)
+{
+    return commit.readMicroPC(tid);
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::setMicroPC(Addr new_PC,unsigned tid)
+{
+    commit.setMicroPC(new_PC, tid);
+}
+
 template <class Impl>
 uint64_t
 FullO3CPU<Impl>::readNextPC(unsigned tid)
@ -1178,6 +1192,20 @@ FullO3CPU<Impl>::setNextNPC(uint64_t val,unsigned tid)
    commit.setNextNPC(val, tid);
 }

+template <class Impl>
+uint64_t
+FullO3CPU<Impl>::readNextMicroPC(unsigned tid)
+{
+    return commit.readNextMicroPC(tid);
+}
+
+template <class Impl>
+void
+FullO3CPU<Impl>::setNextMicroPC(Addr new_PC,unsigned tid)
+{
+    commit.setNextMicroPC(new_PC, tid);
+}
+
 template <class Impl>
 typename FullO3CPU<Impl>::ListIt
 FullO3CPU<Impl>::addInst(DynInstPtr &inst)
@ -1226,9 +1254,7 @@ FullO3CPU<Impl>::removeFrontInst(DynInstPtr &inst)

 template <class Impl>
 void
-FullO3CPU<Impl>::removeInstsNotInROB(unsigned tid,
-                                     bool squash_delay_slot,
-                                     const InstSeqNum &delay_slot_seq_num)
+FullO3CPU<Impl>::removeInstsNotInROB(unsigned tid)
 {
    DPRINTF(O3CPU, "Thread %i: Deleting instructions from instruction"
            " list.\n", tid);
@ -1259,12 +1285,6 @@ FullO3CPU<Impl>::removeInstsNotInROB(unsigned tid,
    while (inst_it != end_it) {
        assert(!instList.empty());

-#if ISA_HAS_DELAY_SLOT
-        if(!squash_delay_slot &&
-           delay_slot_seq_num >= (*inst_it)->seqNum) {
-            break;
-        }
-#endif
        squashInstIt(inst_it, tid);

        inst_it--;
--- a/src/cpu/o3/cpu.hh
+++ b/src/cpu/o3/cpu.hh
@ -433,22 +433,34 @@ class FullO3CPU : public BaseO3CPU
    void setArchFloatRegInt(int reg_idx, uint64_t val, unsigned tid);

    /** Reads the commit PC of a specific thread. */
-    uint64_t readPC(unsigned tid);
+    Addr readPC(unsigned tid);

    /** Sets the commit PC of a specific thread. */
    void setPC(Addr new_PC, unsigned tid);

+    /** Reads the commit micro PC of a specific thread. */
+    Addr readMicroPC(unsigned tid);
+
+    /** Sets the commit micro PC of a specific thread. */
+    void setMicroPC(Addr new_microPC, unsigned tid);
+
    /** Reads the next PC of a specific thread. */
-    uint64_t readNextPC(unsigned tid);
+    Addr readNextPC(unsigned tid);

    /** Sets the next PC of a specific thread. */
-    void setNextPC(uint64_t val, unsigned tid);
+    void setNextPC(Addr val, unsigned tid);

    /** Reads the next NPC of a specific thread. */
-    uint64_t readNextNPC(unsigned tid);
+    Addr readNextNPC(unsigned tid);

    /** Sets the next NPC of a specific thread. */
-    void setNextNPC(uint64_t val, unsigned tid);
+    void setNextNPC(Addr val, unsigned tid);
+
+    /** Reads the commit next micro PC of a specific thread. */
+    Addr readNextMicroPC(unsigned tid);
+
+    /** Sets the commit next micro PC of a specific thread. */
+    void setNextMicroPC(Addr val, unsigned tid);

    /** Function to add instruction onto the head of the list of the
     *  instructions.  Used when new instructions are fetched.
@ -468,8 +480,7 @@ class FullO3CPU : public BaseO3CPU

    /** Remove all instructions that are not currently in the ROB.
     *  There's also an option to not squash delay slot instructions.*/
-    void removeInstsNotInROB(unsigned tid, bool squash_delay_slot,
-                             const InstSeqNum &delay_slot_seq_num);
+    void removeInstsNotInROB(unsigned tid);

    /** Remove all instructions younger than the given sequence number. */
    void removeInstsUntil(const InstSeqNum &seq_num,unsigned tid);
--- a/src/cpu/o3/decode_impl.hh
+++ b/src/cpu/o3/decode_impl.hh
@ -49,8 +49,6 @@ DefaultDecode<Impl>::DefaultDecode(O3CPU *_cpu, Params *params)
        stalls[i].rename = false;
        stalls[i].iew = false;
        stalls[i].commit = false;
-
-        squashAfterDelaySlot[i] = false;
    }

    // @todo: Make into a parameter
@ -275,20 +273,16 @@ DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid)
    ///explicitly for ISAs with delay slots.
    toFetch->decodeInfo[tid].nextNPC =
        inst->branchTarget() + sizeof(TheISA::MachInst);
+    toFetch->decodeInfo[tid].nextMicroPC = inst->readMicroPC();
 #if ISA_HAS_DELAY_SLOT
    toFetch->decodeInfo[tid].branchTaken = inst->readNextNPC() !=
        (inst->readNextPC() + sizeof(TheISA::MachInst));
-
-    toFetch->decodeInfo[tid].bdelayDoneSeqNum = bdelayDoneSeqNum[tid];
-    squashAfterDelaySlot[tid] = false;
-
-    InstSeqNum squash_seq_num = bdelayDoneSeqNum[tid];
 #else
    toFetch->decodeInfo[tid].branchTaken =
        inst->readNextPC() != (inst->readPC() + sizeof(TheISA::MachInst));
+#endif

    InstSeqNum squash_seq_num = inst->seqNum;
-#endif

    // Might have to tell fetch to unblock.
    if (decodeStatus[tid] == Blocked ||
@ -309,30 +303,10 @@ DefaultDecode<Impl>::squash(DynInstPtr &inst, unsigned tid)
    // Clear the instruction list and skid buffer in case they have any
    // insts in them.
    while (!insts[tid].empty()) {
-
-#if ISA_HAS_DELAY_SLOT
-        if (insts[tid].front()->seqNum <= squash_seq_num) {
-            DPRINTF(Decode, "[tid:%i]: Cannot remove incoming decode "
-                    "instructions before delay slot [sn:%i]. %i insts"
-                    "left in decode.\n", tid, squash_seq_num,
-                    insts[tid].size());
-            break;
-        }
-#endif
        insts[tid].pop();
    }

    while (!skidBuffer[tid].empty()) {
-
-#if ISA_HAS_DELAY_SLOT
-        if (skidBuffer[tid].front()->seqNum <= squash_seq_num) {
-            DPRINTF(Decode, "[tid:%i]: Cannot remove skidBuffer "
-                    "instructions before delay slot [sn:%i]. %i insts"
-                    "left in decode.\n", tid, squash_seq_num,
-                    insts[tid].size());
-            break;
-        }
-#endif
        skidBuffer[tid].pop();
    }

@ -760,48 +734,13 @@ DefaultDecode<Impl>::decodeInsts(unsigned tid)

                // Might want to set some sort of boolean and just do
                // a check at the end
-#if !ISA_HAS_DELAY_SLOT
                squash(inst, inst->threadNumber);
                Addr target = inst->branchTarget();
-                inst->setPredTarg(target, target + sizeof(TheISA::MachInst));
+                //The micro pc after an instruction level branch should be 0
+                inst->setPredTarg(target, target + sizeof(TheISA::MachInst), 0);
                break;
-#else
-                // If mispredicted as taken, then ignore delay slot
-                // instruction... else keep delay slot and squash
-                // after it is sent to rename
-                if (inst->readPredTaken() && inst->isCondDelaySlot()) {
-                    DPRINTF(Decode, "[tid:%i]: Conditional delay slot inst."
-                            "[sn:%i] PC %#x mispredicted as taken.\n", tid,
-                            inst->seqNum, inst->PC);
-                    bdelayDoneSeqNum[tid] = inst->seqNum;
-                    squash(inst, inst->threadNumber);
-                    Addr target = inst->branchTarget();
-                    inst->setPredTarg(target,
-                            target + sizeof(TheISA::MachInst));
-                    break;
-                } else {
-                    DPRINTF(Decode, "[tid:%i]: Misprediction detected at "
-                            "[sn:%i] PC %#x, will squash after delay slot "
-                            "inst. is sent to Rename\n",
-                            tid, inst->seqNum, inst->PC);
-                    bdelayDoneSeqNum[tid] = inst->seqNum + 1;
-                    squashAfterDelaySlot[tid] = true;
-                    squashInst[tid] = inst;
-                    continue;
-                }
-#endif
            }
        }
-
-        if (squashAfterDelaySlot[tid]) {
-            assert(!inst->isSquashed());
-            squash(squashInst[tid], squashInst[tid]->threadNumber);
-            Addr target = squashInst[tid]->branchTarget();
-            squashInst[tid]->setPredTarg(target,
-                    target + sizeof(TheISA::MachInst));
-            assert(!inst->isSquashed());
-            break;
-        }
    }

    // If we didn't process all instructions, then we will need to block
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@ -227,7 +227,7 @@ class DefaultFetch
     * @param next_NPC Used for ISAs which use delay slots.
     * @return Whether or not a branch was predicted as taken.
     */
-    bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, Addr &next_NPC);
+    bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC, Addr &next_NPC, Addr &next_MicroPC);

    /**
     * Fetches the cache line that contains fetch_PC.  Returns any
@ -242,12 +242,14 @@ class DefaultFetch
    bool fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid);

    /** Squashes a specific thread and resets the PC. */
-    inline void doSquash(const Addr &new_PC, const Addr &new_NPC, unsigned tid);
+    inline void doSquash(const Addr &new_PC, const Addr &new_NPC,
+                         const Addr &new_MicroPC, unsigned tid);

    /** Squashes a specific thread and resets the PC. Also tells the CPU to
     * remove any instructions between fetch and decode that should be sqaushed.
     */
    void squashFromDecode(const Addr &new_PC, const Addr &new_NPC,
+                          const Addr &new_MicroPC,
                          const InstSeqNum &seq_num, unsigned tid);

    /** Checks if a thread is stalled. */
@ -263,8 +265,8 @@ class DefaultFetch
     * squash should be the commit stage.
     */
    void squash(const Addr &new_PC, const Addr &new_NPC,
-                const InstSeqNum &seq_num,
-                bool squash_delay_slot, unsigned tid);
+                const Addr &new_MicroPC,
+                const InstSeqNum &seq_num, unsigned tid);

    /** Ticks the fetch stage, processing all inputs signals and fetching
     * as many instructions as possible.
@ -347,16 +349,12 @@ class DefaultFetch
    /** Per-thread fetch PC. */
    Addr PC[Impl::MaxThreads];

+    /** Per-thread fetch micro PC. */
+    Addr microPC[Impl::MaxThreads];
+
    /** Per-thread next PC. */
    Addr nextPC[Impl::MaxThreads];

-    /** Per-thread next Next PC.
-     *  This is not a real register but is used for
-     *  architectures that use a branch-delay slot.
-     *  (such as MIPS or Sparc)
-     */
-    Addr nextNPC[Impl::MaxThreads];
-
    /** Memory request used to access cache. */
    RequestPtr memReq[Impl::MaxThreads];

--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@ -312,7 +312,7 @@ DefaultFetch<Impl>::initStage()
    for (int tid = 0; tid < numThreads; tid++) {
        PC[tid] = cpu->readPC(tid);
        nextPC[tid] = cpu->readNextPC(tid);
-        nextNPC[tid] = cpu->readNextNPC(tid);
+        microPC[tid] = cpu->readMicroPC(tid);
    }

    for (int tid=0; tid < numThreads; tid++) {
@ -439,11 +439,7 @@ DefaultFetch<Impl>::takeOverFrom()
        stalls[i].commit = 0;
        PC[i] = cpu->readPC(i);
        nextPC[i] = cpu->readNextPC(i);
-#if ISA_HAS_DELAY_SLOT
-        nextNPC[i] = cpu->readNextNPC(i);
-#else
-        nextNPC[i] = nextPC[i] + sizeof(TheISA::MachInst);
-#endif
+        microPC[i] = cpu->readMicroPC(i);
        fetchStatus[i] = Running;
    }
    numInst = 0;
@ -493,7 +489,7 @@ DefaultFetch<Impl>::switchToInactive()
 template <class Impl>
 bool
 DefaultFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC,
-                                          Addr &next_NPC)
+                                          Addr &next_NPC, Addr &next_MicroPC)
 {
    // Do branch prediction check here.
    // A bit of a misnomer...next_PC is actually the current PC until
@ -501,13 +497,22 @@ DefaultFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC,
    bool predict_taken;

    if (!inst->isControl()) {
-        next_PC  = next_NPC;
-        next_NPC = next_NPC + instSize;
-        inst->setPredTarg(next_PC, next_NPC);
+        if (inst->isMicroOp() && !inst->isLastMicroOp()) {
+            next_MicroPC++;
+        } else {
+            next_PC  = next_NPC;
+            next_NPC = next_NPC + instSize;
+            next_MicroPC = 0;
+        }
+        inst->setPredTarg(next_PC, next_NPC, next_MicroPC);
        inst->setPredTaken(false);
        return false;
    }

+    //Assume for now that all control flow is to a different macroop which
+    //would reset the micro pc to 0.
+    next_MicroPC = 0;
+
    int tid = inst->threadNumber;
    Addr pred_PC = next_PC;
    predict_taken = branchPred.predict(inst, pred_PC, tid);
@ -534,7 +539,7 @@ DefaultFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC,
 #endif
 /*    DPRINTF(Fetch, "[tid:%i]: Branch predicted to go to %#x and then %#x.\n",
            tid, next_PC, next_NPC);*/
-    inst->setPredTarg(next_PC, next_NPC);
+    inst->setPredTarg(next_PC, next_NPC, next_MicroPC);
    inst->setPredTaken(predict_taken);

    ++fetchedBranches;
@ -658,14 +663,14 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
 template <class Impl>
 inline void
 DefaultFetch<Impl>::doSquash(const Addr &new_PC,
-        const Addr &new_NPC, unsigned tid)
+        const Addr &new_NPC, const Addr &new_microPC, unsigned tid)
 {
    DPRINTF(Fetch, "[tid:%i]: Squashing, setting PC to: %#x, NPC to: %#x.\n",
            tid, new_PC, new_NPC);

    PC[tid] = new_PC;
    nextPC[tid] = new_NPC;
-    nextNPC[tid] = new_NPC + instSize;
+    microPC[tid] = new_microPC;

    // Clear the icache miss if it's outstanding.
    if (fetchStatus[tid] == IcacheWaitResponse) {
@ -693,12 +698,12 @@ DefaultFetch<Impl>::doSquash(const Addr &new_PC,
 template<class Impl>
 void
 DefaultFetch<Impl>::squashFromDecode(const Addr &new_PC, const Addr &new_NPC,
-                                     const InstSeqNum &seq_num,
-                                     unsigned tid)
+                                     const Addr &new_MicroPC,
+                                     const InstSeqNum &seq_num, unsigned tid)
 {
    DPRINTF(Fetch, "[tid:%i]: Squashing from decode.\n",tid);

-    doSquash(new_PC, new_NPC, tid);
+    doSquash(new_PC, new_NPC, new_MicroPC, tid);

    // Tell the CPU to remove any instructions that are in flight between
    // fetch and decode.
@ -774,20 +779,15 @@ DefaultFetch<Impl>::updateFetchStatus()
 template <class Impl>
 void
 DefaultFetch<Impl>::squash(const Addr &new_PC, const Addr &new_NPC,
-                           const InstSeqNum &seq_num,
-                           bool squash_delay_slot, unsigned tid)
+                           const Addr &new_MicroPC,
+                           const InstSeqNum &seq_num, unsigned tid)
 {
    DPRINTF(Fetch, "[tid:%u]: Squash from commit.\n",tid);

-    doSquash(new_PC, new_NPC, tid);
+    doSquash(new_PC, new_NPC, new_MicroPC, tid);

-#if ISA_HAS_DELAY_SLOT
    // Tell the CPU to remove any instructions that are not in the ROB.
-    cpu->removeInstsNotInROB(tid, squash_delay_slot, seq_num);
-#else
-    // Tell the CPU to remove any instructions that are not in the ROB.
-    cpu->removeInstsNotInROB(tid, true, 0);
-#endif
+    cpu->removeInstsNotInROB(tid);
 }

 template <class Impl>
@ -896,17 +896,11 @@ DefaultFetch<Impl>::checkSignalsAndUpdate(unsigned tid)

        DPRINTF(Fetch, "[tid:%u]: Squashing instructions due to squash "
                "from commit.\n",tid);
-
-#if ISA_HAS_DELAY_SLOT
-    InstSeqNum doneSeqNum = fromCommit->commitInfo[tid].bdelayDoneSeqNum;
-#else
-    InstSeqNum doneSeqNum = fromCommit->commitInfo[tid].doneSeqNum;
-#endif
        // In any case, squash.
        squash(fromCommit->commitInfo[tid].nextPC,
               fromCommit->commitInfo[tid].nextNPC,
-               doneSeqNum,
-               fromCommit->commitInfo[tid].squashDelaySlot,
+               fromCommit->commitInfo[tid].nextMicroPC,
+               fromCommit->commitInfo[tid].doneSeqNum,
               tid);

        // Also check if there's a mispredict that happened.
@ -955,18 +949,14 @@ DefaultFetch<Impl>::checkSignalsAndUpdate(unsigned tid)

        if (fetchStatus[tid] != Squashing) {

-#if ISA_HAS_DELAY_SLOT
-            InstSeqNum doneSeqNum = fromDecode->decodeInfo[tid].bdelayDoneSeqNum;
-#else
-            InstSeqNum doneSeqNum = fromDecode->decodeInfo[tid].doneSeqNum;
-#endif
            DPRINTF(Fetch, "Squashing from decode with PC = %#x, NPC = %#x\n",
                    fromDecode->decodeInfo[tid].nextPC,
                    fromDecode->decodeInfo[tid].nextNPC);
            // Squash unless we're already squashing
            squashFromDecode(fromDecode->decodeInfo[tid].nextPC,
                             fromDecode->decodeInfo[tid].nextNPC,
-                             doneSeqNum,
+                             fromDecode->decodeInfo[tid].nextMicroPC,
+                             fromDecode->decodeInfo[tid].doneSeqNum,
                             tid);

            return true;
@ -1020,9 +1010,9 @@ DefaultFetch<Impl>::fetch(bool &status_change)
    DPRINTF(Fetch, "Attempting to fetch from [tid:%i]\n", tid);

    // The current PC.
-    Addr &fetch_PC = PC[tid];
-
-    Addr &fetch_NPC = nextPC[tid];
+    Addr fetch_PC = PC[tid];
+    Addr fetch_NPC = nextPC[tid];
+    Addr fetch_MicroPC = microPC[tid];

    // Fault code for memory access.
    Fault fault = NoFault;
@ -1081,6 +1071,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)

    Addr next_PC = fetch_PC;
    Addr next_NPC = fetch_NPC;
+    Addr next_MicroPC = fetch_MicroPC;

    InstSeqNum inst_seq;
    MachInst inst;
@ -1088,6 +1079,9 @@ DefaultFetch<Impl>::fetch(bool &status_change)
    // @todo: Fix this hack.
    unsigned offset = (fetch_PC & cacheBlkMask) & ~3;

+    StaticInstPtr staticInst = NULL;
+    StaticInstPtr macroop = NULL;
+
    if (fault == NoFault) {
        // If the read of the first instruction was successful, then grab the
        // instructions from the rest of the cache line and put them into the
@ -1100,11 +1094,9 @@ DefaultFetch<Impl>::fetch(bool &status_change)
        // ended this fetch block.
        bool predicted_branch = false;

-        for (;
-             offset < cacheBlkSize &&
-                 numInst < fetchWidth &&
-                 !predicted_branch;
-             ++numInst) {
+        while (offset < cacheBlkSize &&
+               numInst < fetchWidth &&
+               !predicted_branch) {

            // If we're branching after this instruction, quit fetching
            // from the same block then.
@ -1115,91 +1107,103 @@ DefaultFetch<Impl>::fetch(bool &status_change)
                        fetch_PC, fetch_NPC);
            }

-
-            // Get a sequence number.
-            inst_seq = cpu->getAndIncrementInstSeq();
-
            // Make sure this is a valid index.
            assert(offset <= cacheBlkSize - instSize);

-            // Get the instruction from the array of the cache line.
-            inst = TheISA::gtoh(*reinterpret_cast<TheISA::MachInst *>
-                        (&cacheData[tid][offset]));
+            if (!macroop) {
+                // Get the instruction from the array of the cache line.
+                inst = TheISA::gtoh(*reinterpret_cast<TheISA::MachInst *>
+                            (&cacheData[tid][offset]));

-            predecoder.setTC(cpu->thread[tid]->getTC());
-            predecoder.moreBytes(fetch_PC, 0, inst);
+                predecoder.setTC(cpu->thread[tid]->getTC());
+                predecoder.moreBytes(fetch_PC, 0, inst);

-            ext_inst = predecoder.getExtMachInst();
-
-            // Create a new DynInst from the instruction fetched.
-            DynInstPtr instruction = new DynInst(ext_inst,
-                                                 fetch_PC, fetch_NPC,
-                                                 next_PC, next_NPC,
-                                                 inst_seq, cpu);
-            instruction->setTid(tid);
-
-            instruction->setASID(tid);
-
-            instruction->setThreadState(cpu->thread[tid]);
-
-            DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created "
-                    "[sn:%lli]\n",
-                    tid, instruction->readPC(), inst_seq);
-
-            //DPRINTF(Fetch, "[tid:%i]: MachInst is %#x\n", tid, ext_inst);
-
-            DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n",
-                    tid, instruction->staticInst->disassemble(fetch_PC));
-
-            instruction->traceData =
-                Trace::getInstRecord(curTick, cpu->tcBase(tid),
-                                     instruction->staticInst,
-                                     instruction->readPC());
-
-            ///FIXME This needs to be more robust in dealing with delay slots
-#if !ISA_HAS_DELAY_SLOT
-//	    predicted_branch |=
-#endif
-            lookupAndUpdateNextPC(instruction, next_PC, next_NPC);
-            predicted_branch |= (next_PC != fetch_NPC);
-
-            // Add instruction to the CPU's list of instructions.
-            instruction->setInstListIt(cpu->addInst(instruction));
-
-            // Write the instruction to the first slot in the queue
-            // that heads to decode.
-            toDecode->insts[numInst] = instruction;
-
-            toDecode->size++;
-
-            // Increment stat of fetched instructions.
-            ++fetchedInsts;
-
-            // Move to the next instruction, unless we have a branch.
-            fetch_PC = next_PC;
-            fetch_NPC = next_NPC;
-
-            if (instruction->isQuiesce()) {
-                DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!",
-                        curTick);
-                fetchStatus[tid] = QuiescePending;
-                ++numInst;
-                status_change = true;
-                break;
+                ext_inst = predecoder.getExtMachInst();
+                staticInst = StaticInstPtr(ext_inst);
+                if (staticInst->isMacroOp())
+                    macroop = staticInst;
            }
+            do {
+                if (macroop) {
+                    staticInst = macroop->fetchMicroOp(fetch_MicroPC);
+                    if (staticInst->isLastMicroOp())
+                        macroop = NULL;
+                }

+                // Get a sequence number.
+                inst_seq = cpu->getAndIncrementInstSeq();
+
+                // Create a new DynInst from the instruction fetched.
+                DynInstPtr instruction = new DynInst(staticInst,
+                                                     fetch_PC, fetch_NPC, fetch_MicroPC,
+                                                     next_PC, next_NPC, next_MicroPC,
+                                                     inst_seq, cpu);
+                instruction->setTid(tid);
+
+                instruction->setASID(tid);
+
+                instruction->setThreadState(cpu->thread[tid]);
+
+                DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created "
+                        "[sn:%lli]\n",
+                        tid, instruction->readPC(), inst_seq);
+
+                //DPRINTF(Fetch, "[tid:%i]: MachInst is %#x\n", tid, ext_inst);
+
+                DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n",
+                        tid, instruction->staticInst->disassemble(fetch_PC));
+
+                instruction->traceData =
+                    Trace::getInstRecord(curTick, cpu->tcBase(tid),
+                                         instruction->staticInst,
+                                         instruction->readPC());
+
+                ///FIXME This needs to be more robust in dealing with delay slots
+                predicted_branch |=
+                    lookupAndUpdateNextPC(instruction, next_PC, next_NPC, next_MicroPC);
+
+                // Add instruction to the CPU's list of instructions.
+                instruction->setInstListIt(cpu->addInst(instruction));
+
+                // Write the instruction to the first slot in the queue
+                // that heads to decode.
+                toDecode->insts[numInst] = instruction;
+
+                toDecode->size++;
+
+                // Increment stat of fetched instructions.
+                ++fetchedInsts;
+
+                // Move to the next instruction, unless we have a branch.
+                fetch_PC = next_PC;
+                fetch_NPC = next_NPC;
+                fetch_MicroPC = next_MicroPC;
+
+                if (instruction->isQuiesce()) {
+                    DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!",
+                            curTick);
+                    fetchStatus[tid] = QuiescePending;
+                    ++numInst;
+                    status_change = true;
+                    break;
+                }
+
+                ++numInst;
+            } while (staticInst->isMicroOp() &&
+                     !staticInst->isLastMicroOp() &&
+                     numInst < fetchWidth);
            offset += instSize;
        }

-        if (offset >= cacheBlkSize) {
-            DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache "
-                    "block.\n", tid);
+        if (predicted_branch) {
+            DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch "
+                    "instruction encountered.\n", tid);
        } else if (numInst >= fetchWidth) {
            DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth "
                    "for this cycle.\n", tid);
-        } else if (predicted_branch) {
-            DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch "
-                    "instruction encountered.\n", tid);
+        } else if (offset >= cacheBlkSize) {
+            DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache "
+                    "block.\n", tid);
        }
    }

@ -1212,12 +1216,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
    if (fault == NoFault) {
        PC[tid] = next_PC;
        nextPC[tid] = next_NPC;
-        nextNPC[tid] = next_NPC + instSize;
-#if ISA_HAS_DELAY_SLOT
-        DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, PC[tid]);
-#else
+        microPC[tid] = next_MicroPC;
        DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, next_PC);
-#endif
    } else {
        // We shouldn't be in an icache miss and also have a fault (an ITB
        // miss)
@ -1235,8 +1235,9 @@ DefaultFetch<Impl>::fetch(bool &status_change)
        // We will use a nop in order to carry the fault.
        ext_inst = TheISA::NoopMachInst;

+        StaticInstPtr staticInst = new StaticInst(ext_inst);
        // Create a new DynInst from the dummy nop.
-        DynInstPtr instruction = new DynInst(ext_inst,
+        DynInstPtr instruction = new DynInst(staticInst,
                                             fetch_PC, fetch_NPC,
                                             next_PC, next_NPC,
                                             inst_seq, cpu);
--- a/src/cpu/o3/iew.hh
+++ b/src/cpu/o3/iew.hh
@ -402,9 +402,6 @@ class DefaultIEW
    /** Records if there is a fetch redirect on this cycle for each thread. */
    bool fetchRedirect[Impl::MaxThreads];

-    /** Keeps track of the last valid branch delay slot instss for threads */
-    InstSeqNum bdelayDoneSeqNum[Impl::MaxThreads];
-
    /** Used to track if all instructions have been dispatched this cycle.
     * If they have not, then blocking must have occurred, and the instructions
     * would already be added to the skid buffer.
--- a/src/cpu/o3/iew_impl.hh
+++ b/src/cpu/o3/iew_impl.hh
@ -69,7 +69,6 @@ DefaultIEW<Impl>::DefaultIEW(O3CPU *_cpu, Params *params)
        dispatchStatus[i] = Running;
        stalls[i].commit = false;
        fetchRedirect[i] = false;
-        bdelayDoneSeqNum[i] = 0;
    }

    wbMax = wbWidth * params->wbDepth;
@ -410,31 +409,14 @@ DefaultIEW<Impl>::squash(unsigned tid)
    instQueue.squash(tid);

    // Tell the LDSTQ to start squashing.
-#if ISA_HAS_DELAY_SLOT
-    ldstQueue.squash(fromCommit->commitInfo[tid].bdelayDoneSeqNum, tid);
-#else
    ldstQueue.squash(fromCommit->commitInfo[tid].doneSeqNum, tid);
-#endif
    updatedQueues = true;

    // Clear the skid buffer in case it has any data in it.
    DPRINTF(IEW, "[tid:%i]: Removing skidbuffer instructions until [sn:%i].\n",
-            tid, fromCommit->commitInfo[tid].bdelayDoneSeqNum);
+            tid, fromCommit->commitInfo[tid].doneSeqNum);

    while (!skidBuffer[tid].empty()) {
-#if ISA_HAS_DELAY_SLOT
-        if (skidBuffer[tid].front()->seqNum <=
-            fromCommit->commitInfo[tid].bdelayDoneSeqNum) {
-            DPRINTF(IEW, "[tid:%i]: Cannot remove skidbuffer instructions "
-                    "that occur before delay slot [sn:%i].\n",
-                    fromCommit->commitInfo[tid].bdelayDoneSeqNum,
-                    tid);
-            break;
-        } else {
-            DPRINTF(IEW, "[tid:%i]: Removing instruction [sn:%i] from "
-                    "skidBuffer.\n", tid, skidBuffer[tid].front()->seqNum);
-        }
-#endif
        if (skidBuffer[tid].front()->isLoad() ||
            skidBuffer[tid].front()->isStore() ) {
            toRename->iewInfo[tid].dispatchedToLSQ++;
@ -445,8 +427,6 @@ DefaultIEW<Impl>::squash(unsigned tid)
        skidBuffer[tid].pop();
    }

-    bdelayDoneSeqNum[tid] = fromCommit->commitInfo[tid].bdelayDoneSeqNum;
-
    emptyRenameInsts(tid);
 }

@ -462,38 +442,19 @@ DefaultIEW<Impl>::squashDueToBranch(DynInstPtr &inst, unsigned tid)
    toCommit->mispredPC[tid] = inst->readPC();
    toCommit->branchMispredict[tid] = true;

-    int instSize = sizeof(TheISA::MachInst);
 #if ISA_HAS_DELAY_SLOT
-    bool branch_taken =
+    int instSize = sizeof(TheISA::MachInst);
+    toCommit->branchTaken[tid] =
        !(inst->readNextPC() + instSize == inst->readNextNPC() &&
          (inst->readNextPC() == inst->readPC() + instSize ||
           inst->readNextPC() == inst->readPC() + 2 * instSize));
-    DPRINTF(Sparc, "Branch taken = %s [sn:%i]\n",
-            branch_taken ? "true": "false", inst->seqNum);
-
-    toCommit->branchTaken[tid] = branch_taken;
-
-    bool squashDelaySlot = true;
-//	(inst->readNextPC() != inst->readPC() + sizeof(TheISA::MachInst));
-    DPRINTF(Sparc, "Squash delay slot = %s [sn:%i]\n",
-            squashDelaySlot ? "true": "false", inst->seqNum);
-    toCommit->squashDelaySlot[tid] = squashDelaySlot;
-    //If we're squashing the delay slot, we need to pick back up at NextPC.
-    //Otherwise, NextPC isn't being squashed, so we should pick back up at
-    //NextNPC.
-    if (squashDelaySlot) {
-        toCommit->nextPC[tid] = inst->readNextPC();
-        toCommit->nextNPC[tid] = inst->readNextNPC();
-    } else {
-        toCommit->nextPC[tid] = inst->readNextNPC();
-        toCommit->nextNPC[tid] = inst->readNextNPC() + instSize;
-    }
 #else
    toCommit->branchTaken[tid] = inst->readNextPC() !=
        (inst->readPC() + sizeof(TheISA::MachInst));
-    toCommit->nextPC[tid] = inst->readNextPC();
-    toCommit->nextNPC[tid] = inst->readNextPC() + instSize;
 #endif
+    toCommit->nextPC[tid] = inst->readNextPC();
+    toCommit->nextNPC[tid] = inst->readNextNPC();
+    toCommit->nextMicroPC[tid] = inst->readNextMicroPC();

    toCommit->includeSquashInst[tid] = false;

@ -510,11 +471,7 @@ DefaultIEW<Impl>::squashDueToMemOrder(DynInstPtr &inst, unsigned tid)
    toCommit->squash[tid] = true;
    toCommit->squashedSeqNum[tid] = inst->seqNum;
    toCommit->nextPC[tid] = inst->readNextPC();
-#if ISA_HAS_DELAY_SLOT
    toCommit->nextNPC[tid] = inst->readNextNPC();
-#else
-    toCommit->nextNPC[tid] = inst->readNextPC() + sizeof(TheISA::MachInst);
-#endif
    toCommit->branchMispredict[tid] = false;

    toCommit->includeSquashInst[tid] = false;
@ -532,11 +489,7 @@ DefaultIEW<Impl>::squashDueToMemBlocked(DynInstPtr &inst, unsigned tid)
    toCommit->squash[tid] = true;
    toCommit->squashedSeqNum[tid] = inst->seqNum;
    toCommit->nextPC[tid] = inst->readPC();
-#if ISA_HAS_DELAY_SLOT
    toCommit->nextNPC[tid] = inst->readNextPC();
-#else
-    toCommit->nextNPC[tid] = inst->readPC() + sizeof(TheISA::MachInst);
-#endif
    toCommit->branchMispredict[tid] = false;

    // Must include the broadcasted SN in the squash.
@ -880,10 +833,8 @@ DefaultIEW<Impl>::sortInsts()
 {
    int insts_from_rename = fromRename->size;
 #ifdef DEBUG
-#if !ISA_HAS_DELAY_SLOT
    for (int i = 0; i < numThreads; i++)
        assert(insts[i].empty());
-#endif
 #endif
    for (int i = 0; i < insts_from_rename; ++i) {
        insts[fromRename->insts[i]->threadNumber].push(fromRename->insts[i]);
@ -894,21 +845,9 @@ template <class Impl>
 void
 DefaultIEW<Impl>::emptyRenameInsts(unsigned tid)
 {
-    DPRINTF(IEW, "[tid:%i]: Removing incoming rename instructions until "
-            "[sn:%i].\n", tid, bdelayDoneSeqNum[tid]);
+    DPRINTF(IEW, "[tid:%i]: Removing incoming rename instructions\n", tid);

    while (!insts[tid].empty()) {
-#if ISA_HAS_DELAY_SLOT
-        if (insts[tid].front()->seqNum <= bdelayDoneSeqNum[tid]) {
-            DPRINTF(IEW, "[tid:%i]: Done removing, cannot remove instruction"
-                    " that occurs at or before delay slot [sn:%i].\n",
-                    tid, bdelayDoneSeqNum[tid]);
-            break;
-        } else {
-            DPRINTF(IEW, "[tid:%i]: Removing incoming rename instruction "
-                    "[sn:%i].\n", tid, insts[tid].front()->seqNum);
-        }
-#endif

        if (insts[tid].front()->isLoad() ||
            insts[tid].front()->isStore() ) {
--- a/src/cpu/o3/inst_queue_impl.hh
+++ b/src/cpu/o3/inst_queue_impl.hh
@ -1005,11 +1005,7 @@ InstructionQueue<Impl>::squash(unsigned tid)

    // Read instruction sequence number of last instruction out of the
    // time buffer.
-#if ISA_HAS_DELAY_SLOT
-    squashedSeqNum[tid] = fromCommit->commitInfo[tid].bdelayDoneSeqNum;
-#else
    squashedSeqNum[tid] = fromCommit->commitInfo[tid].doneSeqNum;
-#endif

    // Call doSquash if there are insts in the IQ
    if (count[tid] > 0) {
--- a/src/cpu/o3/mips/dyn_inst.hh
+++ b/src/cpu/o3/mips/dyn_inst.hh
@ -69,10 +69,16 @@ class MipsDynInst : public BaseDynInst<Impl>
    };

  public:
+    /** BaseDynInst constructor given a static instruction and its PC state. */
+    MipsDynInst(StaticInstPtr staticInst,
+                Addr PC, Addr NPC, Addr microPC,
+                Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
+                InstSeqNum seq_num, O3CPU *cpu);
+
    /** BaseDynInst constructor given a binary instruction. */
    MipsDynInst(ExtMachInst inst,
-                Addr PC, Addr NPC,
-                Addr Pred_PC, Addr Pred_NPC,
+                Addr PC, Addr NPC, Addr microPC,
+                Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
                InstSeqNum seq_num, O3CPU *cpu);

    /** BaseDynInst constructor given a static inst pointer. */
--- a/src/cpu/o3/mips/dyn_inst_impl.hh
+++ b/src/cpu/o3/mips/dyn_inst_impl.hh
@ -31,11 +31,23 @@
 #include "cpu/o3/mips/dyn_inst.hh"

 template <class Impl>
-MipsDynInst<Impl>::MipsDynInst(ExtMachInst inst,
-                               Addr PC, Addr NPC,
-                               Addr Pred_PC, Addr Pred_NPC,
+MipsDynInst<Impl>::MipsDynInst(StaticInstPtr staticInst,
+                               Addr PC, Addr NPC, Addr microPC,
+                               Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
                               InstSeqNum seq_num, O3CPU *cpu)
-    : BaseDynInst<Impl>(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu)
+    : BaseDynInst<Impl>(staticInst, PC, NPC, microPC,
+            Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu)
+{
+    initVars();
+}
+
+template <class Impl>
+MipsDynInst<Impl>::MipsDynInst(ExtMachInst inst,
+                               Addr PC, Addr NPC, Addr microPC,
+                               Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
+                               InstSeqNum seq_num, O3CPU *cpu)
+    : BaseDynInst<Impl>(inst, PC, NPC, microPC,
+            Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu)
 {
    initVars();
 }
--- a/src/cpu/o3/rename_impl.hh
+++ b/src/cpu/o3/rename_impl.hh
@ -356,47 +356,12 @@ DefaultRename<Impl>::squash(const InstSeqNum &squash_seq_num, unsigned tid)
    }

    // Clear the instruction list and skid buffer in case they have any
-    // insts in them. Since we support multiple ISAs, we cant just:
-    // "insts[tid].clear();" or "skidBuffer[tid].clear()" since there is
-    // a possible delay slot inst for different architectures
-    // insts[tid].clear();
-#if ISA_HAS_DELAY_SLOT
-    DPRINTF(Rename, "[tid:%i] Squashing incoming decode instructions until "
-            "[sn:%i].\n",tid, squash_seq_num);
-    ListIt ilist_it = insts[tid].begin();
-    while (ilist_it != insts[tid].end()) {
-        if ((*ilist_it)->seqNum > squash_seq_num) {
-            (*ilist_it)->setSquashed();
-            DPRINTF(Rename, "Squashing incoming decode instruction, "
-                    "[tid:%i] [sn:%i] PC %08p.\n", tid, (*ilist_it)->seqNum, (*ilist_it)->PC);
-        }
-        ilist_it++;
-    }
-#else
+    // insts in them.
    insts[tid].clear();
-#endif

    // Clear the skid buffer in case it has any data in it.
-    // See comments above.
-    //     skidBuffer[tid].clear();
-#if ISA_HAS_DELAY_SLOT
-    DPRINTF(Rename, "[tid:%i] Squashing incoming skidbuffer instructions "
-            "until [sn:%i].\n", tid, squash_seq_num);
-    ListIt slist_it = skidBuffer[tid].begin();
-    while (slist_it != skidBuffer[tid].end()) {
-        if ((*slist_it)->seqNum > squash_seq_num) {
-            (*slist_it)->setSquashed();
-            DPRINTF(Rename, "Squashing skidbuffer instruction, [tid:%i] [sn:%i]"
-                    "PC %08p.\n", tid, (*slist_it)->seqNum, (*slist_it)->PC);
-        }
-        slist_it++;
-    }
-    resumeUnblocking = (skidBuffer[tid].size() != 0);
-    DPRINTF(Rename, "Resume unblocking set to %s\n",
-            resumeUnblocking ? "true" : "false");
-#else
    skidBuffer[tid].clear();
-#endif
+
    doSquash(squash_seq_num, tid);
 }

@ -776,10 +741,8 @@ DefaultRename<Impl>::sortInsts()
 {
    int insts_from_decode = fromDecode->size;
 #ifdef DEBUG
-#if !ISA_HAS_DELAY_SLOT
    for (int i=0; i < numThreads; i++)
        assert(insts[i].empty());
-#endif
 #endif
    for (int i = 0; i < insts_from_decode; ++i) {
        DynInstPtr inst = fromDecode->insts[i];
@ -1000,6 +963,7 @@ DefaultRename<Impl>::renameSrcRegs(DynInstPtr &inst,unsigned tid)
            // Floating point and Miscellaneous registers need their indexes
            // adjusted to account for the expanded number of flattened int regs.
            flat_src_reg = src_reg - TheISA::FP_Base_DepTag + TheISA::NumIntRegs;
+            DPRINTF(Rename, "Adjusting reg index from %d to %d.\n", src_reg, flat_src_reg);
        }

        inst->flattenSrcReg(src_idx, flat_src_reg);
@ -1016,9 +980,11 @@ DefaultRename<Impl>::renameSrcRegs(DynInstPtr &inst,unsigned tid)

        // See if the register is ready or not.
        if (scoreboard->getReg(renamed_reg) == true) {
-            DPRINTF(Rename, "[tid:%u]: Register is ready.\n", tid);
+            DPRINTF(Rename, "[tid:%u]: Register %d is ready.\n", tid, renamed_reg);

            inst->markSrcRegReady(src_idx);
+        } else {
+            DPRINTF(Rename, "[tid:%u]: Register %d is not ready.\n", tid, renamed_reg);
        }

        ++renameRenameLookups;
@ -1045,6 +1011,7 @@ DefaultRename<Impl>::renameDestRegs(DynInstPtr &inst,unsigned tid)
            // Floating point and Miscellaneous registers need their indexes
            // adjusted to account for the expanded number of flattened int regs.
            flat_dest_reg = dest_reg - TheISA::FP_Base_DepTag + TheISA::NumIntRegs;
+            DPRINTF(Rename, "Adjusting reg index from %d to %d.\n", dest_reg, flat_dest_reg);
        }

        inst->flattenDestReg(dest_idx, flat_dest_reg);
@ -1248,13 +1215,7 @@ DefaultRename<Impl>::checkSignalsAndUpdate(unsigned tid)
        DPRINTF(Rename, "[tid:%u]: Squashing instructions due to squash from "
                "commit.\n", tid);

-#if ISA_HAS_DELAY_SLOT
-        InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].bdelayDoneSeqNum;
-#else
-        InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].doneSeqNum;
-#endif
-
-        squash(squashed_seq_num, tid);
+        squash(fromCommit->commitInfo[tid].doneSeqNum, tid);

        return true;
    }
--- a/src/cpu/o3/sparc/dyn_inst.hh
+++ b/src/cpu/o3/sparc/dyn_inst.hh
@ -56,8 +56,14 @@ class SparcDynInst : public BaseDynInst<Impl>

  public:
    /** BaseDynInst constructor given a binary instruction. */
-    SparcDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC,
-            Addr Pred_PC, Addr Pred_NPC, InstSeqNum seq_num, O3CPU *cpu);
+    SparcDynInst(StaticInstPtr staticInst, Addr PC, Addr NPC, Addr microPC,
+            Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
+            InstSeqNum seq_num, O3CPU *cpu);
+
+    /** BaseDynInst constructor given a binary instruction. */
+    SparcDynInst(TheISA::ExtMachInst inst, Addr PC, Addr NPC, Addr microPC,
+            Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
+            InstSeqNum seq_num, O3CPU *cpu);

    /** BaseDynInst constructor given a static inst pointer. */
    SparcDynInst(StaticInstPtr &_staticInst);
--- a/src/cpu/o3/sparc/dyn_inst_impl.hh
+++ b/src/cpu/o3/sparc/dyn_inst_impl.hh
@ -31,10 +31,23 @@
 #include "cpu/o3/sparc/dyn_inst.hh"

 template <class Impl>
-SparcDynInst<Impl>::SparcDynInst(TheISA::ExtMachInst inst,
-        Addr PC, Addr NPC, Addr Pred_PC, Addr Pred_NPC,
+SparcDynInst<Impl>::SparcDynInst(StaticInstPtr staticInst,
+        Addr PC, Addr NPC, Addr microPC,
+        Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
        InstSeqNum seq_num, O3CPU *cpu)
-    : BaseDynInst<Impl>(inst, PC, NPC, Pred_PC, Pred_NPC, seq_num, cpu)
+    : BaseDynInst<Impl>(staticInst, PC, NPC, microPC,
+            Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu)
+{
+    initVars();
+}
+
+template <class Impl>
+SparcDynInst<Impl>::SparcDynInst(TheISA::ExtMachInst inst,
+        Addr PC, Addr NPC, Addr microPC,
+        Addr Pred_PC, Addr Pred_NPC, Addr Pred_MicroPC,
+        InstSeqNum seq_num, O3CPU *cpu)
+    : BaseDynInst<Impl>(inst, PC, NPC, microPC,
+            Pred_PC, Pred_NPC, Pred_MicroPC, seq_num, cpu)
 {
    initVars();
 }
--- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt
+++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/m5stats.txt
@ -1,17 +1,17 @@

 ---------- Begin Simulation Statistics ----------
 global.BPredUnit.BTBCorrect                         0                       # Number of correct BTB predictions (this stat may not work properly.
-global.BPredUnit.BTBHits                         2990                       # Number of BTB hits
-global.BPredUnit.BTBLookups                      7055                       # Number of BTB lookups
+global.BPredUnit.BTBHits                         3021                       # Number of BTB hits
+global.BPredUnit.BTBLookups                      7086                       # Number of BTB lookups
 global.BPredUnit.RASInCorrect                       0                       # Number of incorrect RAS predictions.
 global.BPredUnit.condIncorrect                   2077                       # Number of conditional branches incorrect
-global.BPredUnit.condPredicted                   7846                       # Number of conditional branches predicted
-global.BPredUnit.lookups                         7846                       # Number of BP lookups
+global.BPredUnit.condPredicted                   7877                       # Number of conditional branches predicted
+global.BPredUnit.lookups                         7877                       # Number of BP lookups
 global.BPredUnit.usedRAS                            0                       # Number of times the RAS was used to get a target.
-host_inst_rate                                  15119                       # Simulator instruction rate (inst/s)
-host_mem_usage                                 154868                       # Number of bytes of host memory used
-host_seconds                                     0.73                       # Real time elapsed on the host
-host_tick_rate                                1956796                       # Simulator tick rate (ticks/s)
+host_inst_rate                                   4388                       # Simulator instruction rate (inst/s)
+host_mem_usage                                 179936                       # Number of bytes of host memory used
+host_seconds                                     2.50                       # Real time elapsed on the host
+host_tick_rate                                 568121                       # Simulator tick rate (ticks/s)
 memdepunit.memDep.conflictingLoads                 12                       # Number of conflicting loads.
 memdepunit.memDep.conflictingStores                 0                       # Number of conflicting stores.
 memdepunit.memDep.insertedLoads                  3250                       # Number of loads inserted to the mem dependence unit.
@ -19,22 +19,22 @@ memdepunit.memDep.insertedStores                 2817                       # Nu
 sim_freq                                 1000000000000                       # Frequency of simulated ticks
 sim_insts                                       10976                       # Number of instructions simulated
 sim_seconds                                  0.000001                       # Number of seconds simulated
-sim_ticks                                     1421211                       # Number of ticks simulated
+sim_ticks                                     1421207                       # Number of ticks simulated
 system.cpu.commit.COM:branches                   2152                       # Number of branches committed
-system.cpu.commit.COM:bw_lim_events               172                       # number cycles where commit BW limit reached
+system.cpu.commit.COM:bw_lim_events               225                       # number cycles where commit BW limit reached
 system.cpu.commit.COM:bw_limited                    0                       # number of insts not committed due to BW limits
 system.cpu.commit.COM:committed_per_cycle.start_dist                     # Number of insts commited each cycle
-system.cpu.commit.COM:committed_per_cycle.samples       221349                      
+system.cpu.commit.COM:committed_per_cycle.samples       220766                      
 system.cpu.commit.COM:committed_per_cycle.min_value            0                      
-                               0       215844   9751.30%           
-                               1         2970    134.18%           
-                               2         1290     58.28%           
-                               3          631     28.51%           
-                               4          208      9.40%           
-                               5           90      4.07%           
-                               6          133      6.01%           
+                               0       215368   9755.49%           
+                               1         2915    132.04%           
+                               2         1196     54.18%           
+                               3          673     30.48%           
+                               4          208      9.42%           
+                               5           79      3.58%           
+                               6           91      4.12%           
                               7           11      0.50%           
-                               8          172      7.77%           
+                               8          225     10.19%           
 system.cpu.commit.COM:committed_per_cycle.max_value            8                      
 system.cpu.commit.COM:committed_per_cycle.end_dist

@ -49,65 +49,65 @@ system.cpu.commit.commitNonSpecStalls             327                       # Th
 system.cpu.commit.commitSquashedInsts           14263                       # The number of squashed insts skipped by commit
 system.cpu.committedInsts                       10976                       # Number of Instructions Simulated
 system.cpu.committedInsts_total                 10976                       # Number of Instructions Simulated
-system.cpu.cpi                             129.483509                       # CPI: Cycles Per Instruction
-system.cpu.cpi_total                       129.483509                       # CPI: Total CPI of All Threads
-system.cpu.dcache.ReadReq_accesses               2737                       # number of ReadReq accesses(hits+misses)
-system.cpu.dcache.ReadReq_avg_miss_latency  6585.044776                       # average ReadReq miss latency
-system.cpu.dcache.ReadReq_avg_mshr_miss_latency  6511.939394                       # average ReadReq mshr miss latency
-system.cpu.dcache.ReadReq_hits                   2603                       # number of ReadReq hits
-system.cpu.dcache.ReadReq_miss_latency         882396                       # number of ReadReq miss cycles
-system.cpu.dcache.ReadReq_miss_rate          0.048959                       # miss rate for ReadReq accesses
+system.cpu.cpi                             129.483145                       # CPI: Cycles Per Instruction
+system.cpu.cpi_total                       129.483145                       # CPI: Total CPI of All Threads
+system.cpu.dcache.ReadReq_accesses               2738                       # number of ReadReq accesses(hits+misses)
+system.cpu.dcache.ReadReq_avg_miss_latency  6586.074627                       # average ReadReq miss latency
+system.cpu.dcache.ReadReq_avg_mshr_miss_latency  6513.166667                       # average ReadReq mshr miss latency
+system.cpu.dcache.ReadReq_hits                   2604                       # number of ReadReq hits
+system.cpu.dcache.ReadReq_miss_latency         882534                       # number of ReadReq miss cycles
+system.cpu.dcache.ReadReq_miss_rate          0.048941                       # miss rate for ReadReq accesses
 system.cpu.dcache.ReadReq_misses                  134                       # number of ReadReq misses
 system.cpu.dcache.ReadReq_mshr_hits                68                       # number of ReadReq MSHR hits
-system.cpu.dcache.ReadReq_mshr_miss_latency       429788                       # number of ReadReq MSHR miss cycles
-system.cpu.dcache.ReadReq_mshr_miss_rate     0.024114                       # mshr miss rate for ReadReq accesses
+system.cpu.dcache.ReadReq_mshr_miss_latency       429869                       # number of ReadReq MSHR miss cycles
+system.cpu.dcache.ReadReq_mshr_miss_rate     0.024105                       # mshr miss rate for ReadReq accesses
 system.cpu.dcache.ReadReq_mshr_misses              66                       # number of ReadReq MSHR misses
 system.cpu.dcache.SwapReq_accesses                  6                       # number of SwapReq accesses(hits+misses)
 system.cpu.dcache.SwapReq_hits                      6                       # number of SwapReq hits
 system.cpu.dcache.WriteReq_accesses              1292                       # number of WriteReq accesses(hits+misses)
-system.cpu.dcache.WriteReq_avg_miss_latency  7960.583924                       # average WriteReq miss latency
-system.cpu.dcache.WriteReq_avg_mshr_miss_latency  7136.918605                       # average WriteReq mshr miss latency
+system.cpu.dcache.WriteReq_avg_miss_latency  7962.583924                       # average WriteReq miss latency
+system.cpu.dcache.WriteReq_avg_mshr_miss_latency  7138.593023                       # average WriteReq mshr miss latency
 system.cpu.dcache.WriteReq_hits                   869                       # number of WriteReq hits
-system.cpu.dcache.WriteReq_miss_latency       3367327                       # number of WriteReq miss cycles
+system.cpu.dcache.WriteReq_miss_latency       3368173                       # number of WriteReq miss cycles
 system.cpu.dcache.WriteReq_miss_rate         0.327399                       # miss rate for WriteReq accesses
 system.cpu.dcache.WriteReq_misses                 423                       # number of WriteReq misses
 system.cpu.dcache.WriteReq_mshr_hits              337                       # number of WriteReq MSHR hits
-system.cpu.dcache.WriteReq_mshr_miss_latency       613775                       # number of WriteReq MSHR miss cycles
+system.cpu.dcache.WriteReq_mshr_miss_latency       613919                       # number of WriteReq MSHR miss cycles
 system.cpu.dcache.WriteReq_mshr_miss_rate     0.066563                       # mshr miss rate for WriteReq accesses
 system.cpu.dcache.WriteReq_mshr_misses             86                       # number of WriteReq MSHR misses
 system.cpu.dcache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
 system.cpu.dcache.avg_blocked_cycles_no_targets <err: div-0>                       # average number of cycles each access was blocked
-system.cpu.dcache.avg_refs                  22.881579                       # Average number of references to valid blocks.
+system.cpu.dcache.avg_refs                  22.888158                       # Average number of references to valid blocks.
 system.cpu.dcache.blocked_no_mshrs                  0                       # number of cycles access was blocked
 system.cpu.dcache.blocked_no_targets                0                       # number of cycles access was blocked
 system.cpu.dcache.blocked_cycles_no_mshrs            0                       # number of cycles access was blocked
 system.cpu.dcache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
 system.cpu.dcache.cache_copies                      0                       # number of cache copies performed
-system.cpu.dcache.demand_accesses                4029                       # number of demand (read+write) accesses
-system.cpu.dcache.demand_avg_miss_latency  7629.664273                       # average overall miss latency
-system.cpu.dcache.demand_avg_mshr_miss_latency  6865.546053                       # average overall mshr miss latency
-system.cpu.dcache.demand_hits                    3472                       # number of demand (read+write) hits
-system.cpu.dcache.demand_miss_latency         4249723                       # number of demand (read+write) miss cycles
-system.cpu.dcache.demand_miss_rate           0.138248                       # miss rate for demand accesses
+system.cpu.dcache.demand_accesses                4030                       # number of demand (read+write) accesses
+system.cpu.dcache.demand_avg_miss_latency  7631.430880                       # average overall miss latency
+system.cpu.dcache.demand_avg_mshr_miss_latency  6867.026316                       # average overall mshr miss latency
+system.cpu.dcache.demand_hits                    3473                       # number of demand (read+write) hits
+system.cpu.dcache.demand_miss_latency         4250707                       # number of demand (read+write) miss cycles
+system.cpu.dcache.demand_miss_rate           0.138213                       # miss rate for demand accesses
 system.cpu.dcache.demand_misses                   557                       # number of demand (read+write) misses
 system.cpu.dcache.demand_mshr_hits                405                       # number of demand (read+write) MSHR hits
-system.cpu.dcache.demand_mshr_miss_latency      1043563                       # number of demand (read+write) MSHR miss cycles
-system.cpu.dcache.demand_mshr_miss_rate      0.037726                       # mshr miss rate for demand accesses
+system.cpu.dcache.demand_mshr_miss_latency      1043788                       # number of demand (read+write) MSHR miss cycles
+system.cpu.dcache.demand_mshr_miss_rate      0.037717                       # mshr miss rate for demand accesses
 system.cpu.dcache.demand_mshr_misses              152                       # number of demand (read+write) MSHR misses
 system.cpu.dcache.fast_writes                       0                       # number of fast writes performed
 system.cpu.dcache.mshr_cap_events                   0                       # number of times MSHR cap was activated
 system.cpu.dcache.no_allocate_misses                0                       # Number of misses that were no-allocate
-system.cpu.dcache.overall_accesses               4029                       # number of overall (read+write) accesses
-system.cpu.dcache.overall_avg_miss_latency  7629.664273                       # average overall miss latency
-system.cpu.dcache.overall_avg_mshr_miss_latency  6865.546053                       # average overall mshr miss latency
+system.cpu.dcache.overall_accesses               4030                       # number of overall (read+write) accesses
+system.cpu.dcache.overall_avg_miss_latency  7631.430880                       # average overall miss latency
+system.cpu.dcache.overall_avg_mshr_miss_latency  6867.026316                       # average overall mshr miss latency
 system.cpu.dcache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
-system.cpu.dcache.overall_hits                   3472                       # number of overall hits
-system.cpu.dcache.overall_miss_latency        4249723                       # number of overall miss cycles
-system.cpu.dcache.overall_miss_rate          0.138248                       # miss rate for overall accesses
+system.cpu.dcache.overall_hits                   3473                       # number of overall hits
+system.cpu.dcache.overall_miss_latency        4250707                       # number of overall miss cycles
+system.cpu.dcache.overall_miss_rate          0.138213                       # miss rate for overall accesses
 system.cpu.dcache.overall_misses                  557                       # number of overall misses
 system.cpu.dcache.overall_mshr_hits               405                       # number of overall MSHR hits
-system.cpu.dcache.overall_mshr_miss_latency      1043563                       # number of overall MSHR miss cycles
-system.cpu.dcache.overall_mshr_miss_rate     0.037726                       # mshr miss rate for overall accesses
+system.cpu.dcache.overall_mshr_miss_latency      1043788                       # number of overall MSHR miss cycles
+system.cpu.dcache.overall_mshr_miss_rate     0.037717                       # mshr miss rate for overall accesses
 system.cpu.dcache.overall_mshr_misses             152                       # number of overall MSHR misses
 system.cpu.dcache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
 system.cpu.dcache.overall_mshr_uncacheable_misses            0                       # number of overall MSHR uncacheable misses
@ -123,50 +123,50 @@ system.cpu.dcache.prefetcher.num_hwpf_squashed_from_miss            0
 system.cpu.dcache.replacements                      0                       # number of replacements
 system.cpu.dcache.sampled_refs                    152                       # Sample count of references to valid blocks.
 system.cpu.dcache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
-system.cpu.dcache.tagsinuse                 90.938737                       # Cycle average of tags in use
-system.cpu.dcache.total_refs                     3478                       # Total number of references to valid blocks.
+system.cpu.dcache.tagsinuse                 90.938565                       # Cycle average of tags in use
+system.cpu.dcache.total_refs                     3479                       # Total number of references to valid blocks.
 system.cpu.dcache.warmup_cycle                      0                       # Cycle when the warmup percentage was hit.
 system.cpu.dcache.writebacks                        0                       # number of writebacks
-system.cpu.decode.DECODE:BlockedCycles         192719                       # Number of cycles decode is blocked
-system.cpu.decode.DECODE:DecodedInsts           39774                       # Number of instructions handled by decode
-system.cpu.decode.DECODE:IdleCycles             20128                       # Number of cycles decode is idle
-system.cpu.decode.DECODE:RunCycles               8238                       # Number of cycles decode is running
+system.cpu.decode.DECODE:BlockedCycles         192302                       # Number of cycles decode is blocked
+system.cpu.decode.DECODE:DecodedInsts           39763                       # Number of instructions handled by decode
+system.cpu.decode.DECODE:IdleCycles             19973                       # Number of cycles decode is idle
+system.cpu.decode.DECODE:RunCycles               8441                       # Number of cycles decode is running
 system.cpu.decode.DECODE:SquashCycles            3162                       # Number of cycles decode is squashing
-system.cpu.decode.DECODE:UnblockCycles            264                       # Number of cycles decode is unblocking
-system.cpu.fetch.Branches                        7846                       # Number of branches that fetch encountered
+system.cpu.decode.DECODE:UnblockCycles             50                       # Number of cycles decode is unblocking
+system.cpu.fetch.Branches                        7877                       # Number of branches that fetch encountered
 system.cpu.fetch.CacheLines                      5085                       # Number of cache lines fetched
-system.cpu.fetch.Cycles                         14399                       # Number of cycles fetch has run and was not squashing or blocked
+system.cpu.fetch.Cycles                         14430                       # Number of cycles fetch has run and was not squashing or blocked
 system.cpu.fetch.IcacheSquashes                   745                       # Number of outstanding Icache misses that were squashed
-system.cpu.fetch.Insts                          43304                       # Number of instructions fetch has processed
+system.cpu.fetch.Insts                          43366                       # Number of instructions fetch has processed
 system.cpu.fetch.SquashCycles                    2134                       # Number of cycles fetch has spent squashing
-system.cpu.fetch.branchRate                  0.034947                       # Number of branch fetches per cycle
+system.cpu.fetch.branchRate                  0.035176                       # Number of branch fetches per cycle
 system.cpu.fetch.icacheStallCycles               5085                       # Number of cycles fetch is stalled on an Icache miss
-system.cpu.fetch.predictedBranches               2990                       # Number of branches that fetch has predicted taken
-system.cpu.fetch.rate                        0.192881                       # Number of inst fetches per cycle
+system.cpu.fetch.predictedBranches               3021                       # Number of branches that fetch has predicted taken
+system.cpu.fetch.rate                        0.193660                       # Number of inst fetches per cycle
 system.cpu.fetch.rateDist.start_dist                           # Number of instructions fetched each cycle (Total)
-system.cpu.fetch.rateDist.samples              224511                      
+system.cpu.fetch.rateDist.samples              223928                      
 system.cpu.fetch.rateDist.min_value                 0                      
-                               0       215198   9585.19%           
-                               1         2258    100.57%           
-                               2          627     27.93%           
-                               3          958     42.67%           
-                               4          553     24.63%           
-                               5          816     36.35%           
-                               6          951     42.36%           
-                               7          280     12.47%           
-                               8         2870    127.83%           
+                               0       214584   9582.72%           
+                               1         2258    100.84%           
+                               2          658     29.38%           
+                               3          958     42.78%           
+                               4          553     24.70%           
+                               5          816     36.44%           
+                               6          951     42.47%           
+                               7          280     12.50%           
+                               8         2870    128.17%           
 system.cpu.fetch.rateDist.max_value                 8                      
 system.cpu.fetch.rateDist.end_dist

 system.cpu.icache.ReadReq_accesses               5085                       # number of ReadReq accesses(hits+misses)
-system.cpu.icache.ReadReq_avg_miss_latency  5148.266776                       # average ReadReq miss latency
-system.cpu.icache.ReadReq_avg_mshr_miss_latency  4502.972752                       # average ReadReq mshr miss latency
+system.cpu.icache.ReadReq_avg_miss_latency  5150.152209                       # average ReadReq miss latency
+system.cpu.icache.ReadReq_avg_mshr_miss_latency  4503.673025                       # average ReadReq mshr miss latency
 system.cpu.icache.ReadReq_hits                   4474                       # number of ReadReq hits
-system.cpu.icache.ReadReq_miss_latency        3145591                       # number of ReadReq miss cycles
+system.cpu.icache.ReadReq_miss_latency        3146743                       # number of ReadReq miss cycles
 system.cpu.icache.ReadReq_miss_rate          0.120157                       # miss rate for ReadReq accesses
 system.cpu.icache.ReadReq_misses                  611                       # number of ReadReq misses
 system.cpu.icache.ReadReq_mshr_hits               244                       # number of ReadReq MSHR hits
-system.cpu.icache.ReadReq_mshr_miss_latency      1652591                       # number of ReadReq MSHR miss cycles
+system.cpu.icache.ReadReq_mshr_miss_latency      1652848                       # number of ReadReq MSHR miss cycles
 system.cpu.icache.ReadReq_mshr_miss_rate     0.072173                       # mshr miss rate for ReadReq accesses
 system.cpu.icache.ReadReq_mshr_misses             367                       # number of ReadReq MSHR misses
 system.cpu.icache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
@ -178,29 +178,29 @@ system.cpu.icache.blocked_cycles_no_mshrs            0                       # n
 system.cpu.icache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
 system.cpu.icache.cache_copies                      0                       # number of cache copies performed
 system.cpu.icache.demand_accesses                5085                       # number of demand (read+write) accesses
-system.cpu.icache.demand_avg_miss_latency  5148.266776                       # average overall miss latency
-system.cpu.icache.demand_avg_mshr_miss_latency  4502.972752                       # average overall mshr miss latency
+system.cpu.icache.demand_avg_miss_latency  5150.152209                       # average overall miss latency
+system.cpu.icache.demand_avg_mshr_miss_latency  4503.673025                       # average overall mshr miss latency
 system.cpu.icache.demand_hits                    4474                       # number of demand (read+write) hits
-system.cpu.icache.demand_miss_latency         3145591                       # number of demand (read+write) miss cycles
+system.cpu.icache.demand_miss_latency         3146743                       # number of demand (read+write) miss cycles
 system.cpu.icache.demand_miss_rate           0.120157                       # miss rate for demand accesses
 system.cpu.icache.demand_misses                   611                       # number of demand (read+write) misses
 system.cpu.icache.demand_mshr_hits                244                       # number of demand (read+write) MSHR hits
-system.cpu.icache.demand_mshr_miss_latency      1652591                       # number of demand (read+write) MSHR miss cycles
+system.cpu.icache.demand_mshr_miss_latency      1652848                       # number of demand (read+write) MSHR miss cycles
 system.cpu.icache.demand_mshr_miss_rate      0.072173                       # mshr miss rate for demand accesses
 system.cpu.icache.demand_mshr_misses              367                       # number of demand (read+write) MSHR misses
 system.cpu.icache.fast_writes                       0                       # number of fast writes performed
 system.cpu.icache.mshr_cap_events                   0                       # number of times MSHR cap was activated
 system.cpu.icache.no_allocate_misses                0                       # Number of misses that were no-allocate
 system.cpu.icache.overall_accesses               5085                       # number of overall (read+write) accesses
-system.cpu.icache.overall_avg_miss_latency  5148.266776                       # average overall miss latency
-system.cpu.icache.overall_avg_mshr_miss_latency  4502.972752                       # average overall mshr miss latency
+system.cpu.icache.overall_avg_miss_latency  5150.152209                       # average overall miss latency
+system.cpu.icache.overall_avg_mshr_miss_latency  4503.673025                       # average overall mshr miss latency
 system.cpu.icache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
 system.cpu.icache.overall_hits                   4474                       # number of overall hits
-system.cpu.icache.overall_miss_latency        3145591                       # number of overall miss cycles
+system.cpu.icache.overall_miss_latency        3146743                       # number of overall miss cycles
 system.cpu.icache.overall_miss_rate          0.120157                       # miss rate for overall accesses
 system.cpu.icache.overall_misses                  611                       # number of overall misses
 system.cpu.icache.overall_mshr_hits               244                       # number of overall MSHR hits
-system.cpu.icache.overall_mshr_miss_latency      1652591                       # number of overall MSHR miss cycles
+system.cpu.icache.overall_mshr_miss_latency      1652848                       # number of overall MSHR miss cycles
 system.cpu.icache.overall_mshr_miss_rate     0.072173                       # mshr miss rate for overall accesses
 system.cpu.icache.overall_mshr_misses             367                       # number of overall MSHR misses
 system.cpu.icache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
@ -217,35 +217,35 @@ system.cpu.icache.prefetcher.num_hwpf_squashed_from_miss            0
 system.cpu.icache.replacements                      1                       # number of replacements
 system.cpu.icache.sampled_refs                    363                       # Sample count of references to valid blocks.
 system.cpu.icache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
-system.cpu.icache.tagsinuse                172.869174                       # Cycle average of tags in use
+system.cpu.icache.tagsinuse                172.868641                       # Cycle average of tags in use
 system.cpu.icache.total_refs                     4474                       # Total number of references to valid blocks.
 system.cpu.icache.warmup_cycle                      0                       # Cycle when the warmup percentage was hit.
 system.cpu.icache.writebacks                        0                       # number of writebacks
-system.cpu.idleCycles                         1196701                       # Total number of cycles that the CPU has spent unscheduled due to idling
-system.cpu.iew.EXEC:branches                     3576                       # Number of branches executed
+system.cpu.idleCycles                         1197280                       # Total number of cycles that the CPU has spent unscheduled due to idling
+system.cpu.iew.EXEC:branches                     3577                       # Number of branches executed
 system.cpu.iew.EXEC:nop                             0                       # number of nop insts executed
-system.cpu.iew.EXEC:rate                     0.092548                       # Inst execution rate
-system.cpu.iew.EXEC:refs                         5257                       # number of memory reference insts executed
+system.cpu.iew.EXEC:rate                     0.092802                       # Inst execution rate
+system.cpu.iew.EXEC:refs                         5258                       # number of memory reference insts executed
 system.cpu.iew.EXEC:stores                       2386                       # Number of stores executed
 system.cpu.iew.EXEC:swp                             0                       # number of swp insts executed
 system.cpu.iew.WB:consumers                      9737                       # num instructions consuming a value
-system.cpu.iew.WB:count                         19769                       # cumulative count of insts written-back
+system.cpu.iew.WB:count                         19771                       # cumulative count of insts written-back
 system.cpu.iew.WB:fanout                     0.790901                       # average fanout of values written-back
 system.cpu.iew.WB:penalized                         0                       # number of instrctions required to write to 'other' IQ
 system.cpu.iew.WB:penalized_rate                    0                       # fraction of instructions written-back that wrote to 'other' IQ
 system.cpu.iew.WB:producers                      7701                       # num instructions producing a value
-system.cpu.iew.WB:rate                       0.088054                       # insts written-back per cycle
-system.cpu.iew.WB:sent                          20061                       # cumulative count of insts sent to commit
-system.cpu.iew.branchMispredicts                 2593                       # Number of branch mispredicts detected at execute
+system.cpu.iew.WB:rate                       0.088292                       # insts written-back per cycle
+system.cpu.iew.WB:sent                          20063                       # cumulative count of insts sent to commit
+system.cpu.iew.branchMispredicts                 2594                       # Number of branch mispredicts detected at execute
 system.cpu.iew.iewBlockCycles                     476                       # Number of cycles IEW is blocking
 system.cpu.iew.iewDispLoadInsts                  3250                       # Number of dispatched load instructions
 system.cpu.iew.iewDispNonSpecInsts                617                       # Number of dispatched non-speculative instructions
-system.cpu.iew.iewDispSquashedInsts              2705                       # Number of squashed instructions skipped by dispatch
+system.cpu.iew.iewDispSquashedInsts              2694                       # Number of squashed instructions skipped by dispatch
 system.cpu.iew.iewDispStoreInsts                 2817                       # Number of dispatched store instructions
 system.cpu.iew.iewDispatchedInsts               25240                       # Number of instructions dispatched to IQ
-system.cpu.iew.iewExecLoadInsts                  2871                       # Number of load instructions executed
-system.cpu.iew.iewExecSquashedInsts              1780                       # Number of squashed instructions skipped in execute
-system.cpu.iew.iewExecutedInsts                 20778                       # Number of executed instructions
+system.cpu.iew.iewExecLoadInsts                  2872                       # Number of load instructions executed
+system.cpu.iew.iewExecSquashedInsts              1777                       # Number of squashed instructions skipped in execute
+system.cpu.iew.iewExecutedInsts                 20781                       # Number of executed instructions
 system.cpu.iew.iewIQFullEvents                      7                       # Number of times the IQ has become full, causing a stall
 system.cpu.iew.iewIdleCycles                        0                       # Number of cycles IEW is idle
 system.cpu.iew.iewLSQFullEvents                     0                       # Number of times the LSQ has become full, causing a stall
@ -262,7 +262,7 @@ system.cpu.iew.lsq.thread.0.rescheduledLoads            0
 system.cpu.iew.lsq.thread.0.squashedLoads         1788                       # Number of loads squashed
 system.cpu.iew.lsq.thread.0.squashedStores         1519                       # Number of stores squashed
 system.cpu.iew.memOrderViolationEvents             54                       # Number of memory order violations
-system.cpu.iew.predictedNotTakenIncorrect          962                       # Number of branches that were predicted not taken incorrectly
+system.cpu.iew.predictedNotTakenIncorrect          963                       # Number of branches that were predicted not taken incorrectly
 system.cpu.iew.predictedTakenIncorrect           1631                       # Number of branches that were predicted taken incorrectly
 system.cpu.ipc                               0.007723                       # IPC: Instructions Per Cycle
 system.cpu.ipc_total                         0.007723                       # IPC: Total IPC of All Threads
@ -302,21 +302,21 @@ system.cpu.iq.ISSUE:fu_full.start_dist
                    InstPrefetch            0      0.00%            # attempts to use FU when none available
 system.cpu.iq.ISSUE:fu_full.end_dist
 system.cpu.iq.ISSUE:issued_per_cycle.start_dist                     # Number of insts issued each cycle
-system.cpu.iq.ISSUE:issued_per_cycle.samples       224511                      
+system.cpu.iq.ISSUE:issued_per_cycle.samples       223928                      
 system.cpu.iq.ISSUE:issued_per_cycle.min_value            0                      
-                               0       215315   9590.40%           
-                               1         4124    183.69%           
-                               2         1297     57.77%           
-                               3         1306     58.17%           
-                               4         1190     53.00%           
-                               5          707     31.49%           
-                               6          433     19.29%           
-                               7           83      3.70%           
-                               8           56      2.49%           
+                               0       214838   9594.07%           
+                               1         3976    177.56%           
+                               2         1244     55.55%           
+                               3         1359     60.69%           
+                               4         1316     58.77%           
+                               5          612     27.33%           
+                               6          444     19.83%           
+                               7           83      3.71%           
+                               8           56      2.50%           
 system.cpu.iq.ISSUE:issued_per_cycle.max_value            8                      
 system.cpu.iq.ISSUE:issued_per_cycle.end_dist

-system.cpu.iq.ISSUE:rate                     0.100476                       # Inst issue rate
+system.cpu.iq.ISSUE:rate                     0.100738                       # Inst issue rate
 system.cpu.iq.iqInstsAdded                      24623                       # Number of instructions added to the IQ (excludes non-spec)
 system.cpu.iq.iqInstsIssued                     22558                       # Number of instructions issued
 system.cpu.iq.iqNonSpecInstsAdded                 617                       # Number of non-speculative instructions added to the IQ
@ -325,12 +325,12 @@ system.cpu.iq.iqSquashedInstsIssued               174                       # Nu
 system.cpu.iq.iqSquashedNonSpecRemoved            290                       # Number of squashed non-spec instructions that were removed
 system.cpu.iq.iqSquashedOperandsExamined         5834                       # Number of squashed operands that are examined and possibly removed from graph
 system.cpu.l2cache.ReadReq_accesses               513                       # number of ReadReq accesses(hits+misses)
-system.cpu.l2cache.ReadReq_avg_miss_latency  4754.779727                       # average ReadReq miss latency
-system.cpu.l2cache.ReadReq_avg_mshr_miss_latency  2343.506823                       # average ReadReq mshr miss latency
-system.cpu.l2cache.ReadReq_miss_latency       2439202                       # number of ReadReq miss cycles
+system.cpu.l2cache.ReadReq_avg_miss_latency  4755.715400                       # average ReadReq miss latency
+system.cpu.l2cache.ReadReq_avg_mshr_miss_latency  2343.752437                       # average ReadReq mshr miss latency
+system.cpu.l2cache.ReadReq_miss_latency       2439682                       # number of ReadReq miss cycles
 system.cpu.l2cache.ReadReq_miss_rate                1                       # miss rate for ReadReq accesses
 system.cpu.l2cache.ReadReq_misses                 513                       # number of ReadReq misses
-system.cpu.l2cache.ReadReq_mshr_miss_latency      1202219                       # number of ReadReq MSHR miss cycles
+system.cpu.l2cache.ReadReq_mshr_miss_latency      1202345                       # number of ReadReq MSHR miss cycles
 system.cpu.l2cache.ReadReq_mshr_miss_rate            1                       # mshr miss rate for ReadReq accesses
 system.cpu.l2cache.ReadReq_mshr_misses            513                       # number of ReadReq MSHR misses
 system.cpu.l2cache.avg_blocked_cycles_no_mshrs <err: div-0>                       # average number of cycles each access was blocked
@ -342,29 +342,29 @@ system.cpu.l2cache.blocked_cycles_no_mshrs            0                       #
 system.cpu.l2cache.blocked_cycles_no_targets            0                       # number of cycles access was blocked
 system.cpu.l2cache.cache_copies                     0                       # number of cache copies performed
 system.cpu.l2cache.demand_accesses                513                       # number of demand (read+write) accesses
-system.cpu.l2cache.demand_avg_miss_latency  4754.779727                       # average overall miss latency
-system.cpu.l2cache.demand_avg_mshr_miss_latency  2343.506823                       # average overall mshr miss latency
+system.cpu.l2cache.demand_avg_miss_latency  4755.715400                       # average overall miss latency
+system.cpu.l2cache.demand_avg_mshr_miss_latency  2343.752437                       # average overall mshr miss latency
 system.cpu.l2cache.demand_hits                      0                       # number of demand (read+write) hits
-system.cpu.l2cache.demand_miss_latency        2439202                       # number of demand (read+write) miss cycles
+system.cpu.l2cache.demand_miss_latency        2439682                       # number of demand (read+write) miss cycles
 system.cpu.l2cache.demand_miss_rate                 1                       # miss rate for demand accesses
 system.cpu.l2cache.demand_misses                  513                       # number of demand (read+write) misses
 system.cpu.l2cache.demand_mshr_hits                 0                       # number of demand (read+write) MSHR hits
-system.cpu.l2cache.demand_mshr_miss_latency      1202219                       # number of demand (read+write) MSHR miss cycles
+system.cpu.l2cache.demand_mshr_miss_latency      1202345                       # number of demand (read+write) MSHR miss cycles
 system.cpu.l2cache.demand_mshr_miss_rate            1                       # mshr miss rate for demand accesses
 system.cpu.l2cache.demand_mshr_misses             513                       # number of demand (read+write) MSHR misses
 system.cpu.l2cache.fast_writes                      0                       # number of fast writes performed
 system.cpu.l2cache.mshr_cap_events                  0                       # number of times MSHR cap was activated
 system.cpu.l2cache.no_allocate_misses               0                       # Number of misses that were no-allocate
 system.cpu.l2cache.overall_accesses               513                       # number of overall (read+write) accesses
-system.cpu.l2cache.overall_avg_miss_latency  4754.779727                       # average overall miss latency
-system.cpu.l2cache.overall_avg_mshr_miss_latency  2343.506823                       # average overall mshr miss latency
+system.cpu.l2cache.overall_avg_miss_latency  4755.715400                       # average overall miss latency
+system.cpu.l2cache.overall_avg_mshr_miss_latency  2343.752437                       # average overall mshr miss latency
 system.cpu.l2cache.overall_avg_mshr_uncacheable_latency <err: div-0>                       # average overall mshr uncacheable latency
 system.cpu.l2cache.overall_hits                     0                       # number of overall hits
-system.cpu.l2cache.overall_miss_latency       2439202                       # number of overall miss cycles
+system.cpu.l2cache.overall_miss_latency       2439682                       # number of overall miss cycles
 system.cpu.l2cache.overall_miss_rate                1                       # miss rate for overall accesses
 system.cpu.l2cache.overall_misses                 513                       # number of overall misses
 system.cpu.l2cache.overall_mshr_hits                0                       # number of overall MSHR hits
-system.cpu.l2cache.overall_mshr_miss_latency      1202219                       # number of overall MSHR miss cycles
+system.cpu.l2cache.overall_mshr_miss_latency      1202345                       # number of overall MSHR miss cycles
 system.cpu.l2cache.overall_mshr_miss_rate            1                       # mshr miss rate for overall accesses
 system.cpu.l2cache.overall_mshr_misses            513                       # number of overall MSHR misses
 system.cpu.l2cache.overall_mshr_uncacheable_latency            0                       # number of overall MSHR uncacheable cycles
@ -381,28 +381,27 @@ system.cpu.l2cache.prefetcher.num_hwpf_squashed_from_miss            0
 system.cpu.l2cache.replacements                     0                       # number of replacements
 system.cpu.l2cache.sampled_refs                   512                       # Sample count of references to valid blocks.
 system.cpu.l2cache.soft_prefetch_mshr_full            0                       # number of mshr full events for SW prefetching instrutions
-system.cpu.l2cache.tagsinuse               262.946375                       # Cycle average of tags in use
+system.cpu.l2cache.tagsinuse               262.945674                       # Cycle average of tags in use
 system.cpu.l2cache.total_refs                       0                       # Total number of references to valid blocks.
 system.cpu.l2cache.warmup_cycle                     0                       # Cycle when the warmup percentage was hit.
 system.cpu.l2cache.writebacks                       0                       # number of writebacks
-system.cpu.numCycles                           224511                       # number of cpu cycles simulated
+system.cpu.numCycles                           223928                       # number of cpu cycles simulated
 system.cpu.rename.RENAME:BlockCycles              960                       # Number of cycles rename is blocking
 system.cpu.rename.RENAME:CommittedMaps           9868                       # Number of HB maps that are committed
 system.cpu.rename.RENAME:IQFullEvents               2                       # Number of times rename has blocked due to IQ full
-system.cpu.rename.RENAME:IdleCycles             20098                       # Number of cycles rename is idle
-system.cpu.rename.RENAME:LSQFullEvents            481                       # Number of times rename has blocked due to LSQ full
+system.cpu.rename.RENAME:IdleCycles             21302                       # Number of cycles rename is idle
+system.cpu.rename.RENAME:LSQFullEvents            411                       # Number of times rename has blocked due to LSQ full
 system.cpu.rename.RENAME:ROBFullEvents              4                       # Number of times rename has blocked due to ROB full
 system.cpu.rename.RENAME:RenameLookups          46931                       # Number of register rename lookups that rename has made
-system.cpu.rename.RENAME:RenamedInsts           31260                       # Number of instructions processed by rename
+system.cpu.rename.RENAME:RenamedInsts           31249                       # Number of instructions processed by rename
 system.cpu.rename.RENAME:RenamedOperands        25831                       # Number of destination operands rename has renamed
-system.cpu.rename.RENAME:RunCycles               7921                       # Number of cycles rename is running
+system.cpu.rename.RENAME:RunCycles               7136                       # Number of cycles rename is running
 system.cpu.rename.RENAME:SquashCycles            3162                       # Number of cycles rename is squashing
-system.cpu.rename.RENAME:SquashedInsts           8042                       # Number of squashed instructions processed by rename
-system.cpu.rename.RENAME:UnblockCycles           1212                       # Number of cycles rename is unblocking
+system.cpu.rename.RENAME:UnblockCycles            614                       # Number of cycles rename is unblocking
 system.cpu.rename.RENAME:UndoneMaps             15963                       # Number of HB maps that are undone due to squashing
-system.cpu.rename.RENAME:serializeStallCycles       190573                       # count of cycles rename stalled for serializing inst
+system.cpu.rename.RENAME:serializeStallCycles       190754                       # count of cycles rename stalled for serializing inst
 system.cpu.rename.RENAME:serializingInsts          638                       # count of serializing insts renamed
-system.cpu.rename.RENAME:skidInsts               5594                       # count of insts added to the skid buffer
+system.cpu.rename.RENAME:skidInsts               5529                       # count of insts added to the skid buffer
 system.cpu.rename.RENAME:tempSerializingInsts          629                       # count of temporary serializing insts renamed
 system.cpu.timesIdled                             289                       # Number of times that the entire CPU went into an idle state and unscheduled itself
 system.cpu.workload.PROG:num_syscalls               8                       # Number of system calls
--- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr
+++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stderr
@ -1,4 +1,3 @@
 warn: More than two loadable segments in ELF object.
 warn: Ignoring segment @ 0x0 length 0x0.
-0: system.remote_gdb.listener: listening for remote gdb on port 7003
 warn: Entering event queue @ 0.  Starting simulation...
--- a/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout
+++ b/tests/quick/02.insttest/ref/sparc/linux/o3-timing/stdout
@ -16,9 +16,9 @@ The Regents of The University of Michigan
 All Rights Reserved


-M5 compiled Apr  9 2007 03:06:26
-M5 started Mon Apr  9 03:06:54 2007
-M5 executing on zizzer.eecs.umich.edu
+M5 compiled Apr 13 2007 13:56:34
+M5 started Fri Apr 13 13:56:35 2007
+M5 executing on ahchoo.blinky.homelinux.org
 command line: build/SPARC_SE/m5.fast -d build/SPARC_SE/tests/fast/quick/02.insttest/sparc/linux/o3-timing tests/run.py quick/02.insttest/sparc/linux/o3-timing
 Global frequency set at 1000000000000 ticks per second
-Exiting @ tick 1421211 because target called exit()
+Exiting @ tick 1421207 because target called exit()