Check in of various updates to the CPU. Mainly adds in stats, improves

branch prediction, and makes memory dependence work properly.

SConscript:
    Added return address stack, tournament predictor.
cpu/base_cpu.cc:
    Added debug break and print statements.
cpu/base_dyn_inst.cc:
cpu/base_dyn_inst.hh:
    Comment out possibly unneeded variables.
cpu/beta_cpu/2bit_local_pred.cc:
    2bit predictor no longer speculatively updates itself.
cpu/beta_cpu/alpha_dyn_inst.hh:
    Comment formatting.
cpu/beta_cpu/alpha_full_cpu.hh:
    Formatting
cpu/beta_cpu/alpha_full_cpu_builder.cc:
    Added new parameters for branch predictors, and IQ parameters.
cpu/beta_cpu/alpha_full_cpu_impl.hh:
    Register stats.
cpu/beta_cpu/alpha_params.hh:
    Added parameters for IQ, branch predictors, and store sets.
cpu/beta_cpu/bpred_unit.cc:
    Removed one class.
cpu/beta_cpu/bpred_unit.hh:
    Add in RAS, stats.  Changed branch predictor unit functionality
    so that it holds a history of past branches so it can update, and also
    hold a proper history of the RAS so it can be restored on branch
    mispredicts.
cpu/beta_cpu/bpred_unit_impl.hh:
    Added in stats, history of branches, RAS.  Now bpred unit actually
    modifies the instruction's predicted next PC.
cpu/beta_cpu/btb.cc:
    Add in sanity checks.
cpu/beta_cpu/comm.hh:
    Add in communication where needed, remove it where it's not.
cpu/beta_cpu/commit.hh:
cpu/beta_cpu/rename.hh:
cpu/beta_cpu/rename_impl.hh:
    Add in stats.
cpu/beta_cpu/commit_impl.hh:
    Stats, update what is sent back on branch mispredict.
cpu/beta_cpu/cpu_policy.hh:
    Change the bpred unit being used.
cpu/beta_cpu/decode.hh:
cpu/beta_cpu/decode_impl.hh:
    Stats.
cpu/beta_cpu/fetch.hh:
    Stats, change squash so it can handle squashes from decode differently
    than squashes from commit.
cpu/beta_cpu/fetch_impl.hh:
    Add in stats.  Change how a cache line is fetched.  Update to work with
    caches.  Also have separate functions for different behavior if squash
    is coming from decode vs commit.
cpu/beta_cpu/free_list.hh:
    Remove some old comments.
cpu/beta_cpu/full_cpu.cc:
cpu/beta_cpu/full_cpu.hh:
    Added function to remove instructions from back of instruction list
    until a certain sequence number.
cpu/beta_cpu/iew.hh:
    Stats, separate squashing behavior due to branches vs memory.
cpu/beta_cpu/iew_impl.hh:
    Stats, separate squashing behavior for branches vs memory.
cpu/beta_cpu/inst_queue.cc:
    Debug stuff
cpu/beta_cpu/inst_queue.hh:
    Stats, change how mem dep unit works, debug stuff
cpu/beta_cpu/inst_queue_impl.hh:
    Stats, change how mem dep unit works, debug stuff.  Also add in
    parameters that used to be hardcoded.
cpu/beta_cpu/mem_dep_unit.hh:
cpu/beta_cpu/mem_dep_unit_impl.hh:
    Add in stats, change how memory dependence unit works.  It now holds
    the memory instructions that are waiting for their memory dependences
    to resolve.  It provides which instructions are ready directly to the
    IQ.
cpu/beta_cpu/regfile.hh:
    Fix up sanity checks.
cpu/beta_cpu/rename_map.cc:
    Fix loop variable type.
cpu/beta_cpu/rob_impl.hh:
    Remove intermediate DynInstPtr
cpu/beta_cpu/store_set.cc:
    Add in debugging statements.
cpu/beta_cpu/store_set.hh:
    Reorder function arguments to match the rest of the calls.

--HG--
extra : convert_revision : aabf9b1fecd1d743265dfc3b174d6159937c6f44
This commit is contained in:
Kevin Lim 2004-10-21 18:02:36 -04:00
parent e3fb9afa79
commit 2fb632dbda
43 changed files with 2769 additions and 819 deletions

View file

@ -106,10 +106,12 @@ base_sources = Split('''
cpu/beta_cpu/inst_queue.cc
cpu/beta_cpu/ldstq.cc
cpu/beta_cpu/mem_dep_unit.cc
cpu/beta_cpu/ras.cc
cpu/beta_cpu/rename.cc
cpu/beta_cpu/rename_map.cc
cpu/beta_cpu/rob.cc
cpu/beta_cpu/store_set.cc
cpu/beta_cpu/tournament_pred.cc
cpu/fast_cpu/fast_cpu.cc
cpu/full_cpu/bpred.cc
cpu/full_cpu/commit.cc
@ -481,7 +483,7 @@ env.Append(CPPPATH='.')
# Debug binary
debug = env.Copy(OBJSUFFIX='.do')
debug.Append(CCFLAGS=Split('-g -gstabs+ -O0'))
debug.Append(CCFLAGS=Split('-g -gstabs+ -O0 -lefence'))
debug.Append(CPPDEFINES='DEBUG')
debug.Program(target = 'm5.debug', source = make_objs(sources, debug))

View file

@ -37,6 +37,8 @@
#include "sim/param.hh"
#include "sim/sim_events.hh"
#include "base/trace.hh"
using namespace std;
vector<BaseCPU *> BaseCPU::cpuList;
@ -46,6 +48,7 @@ vector<BaseCPU *> BaseCPU::cpuList;
// been initialized
int maxThreadsPerCPU = 1;
extern void debug_break();
#ifdef FULL_SYSTEM
BaseCPU::BaseCPU(const string &_name, int _number_of_threads,
Counter max_insts_any_thread,
@ -64,9 +67,16 @@ BaseCPU::BaseCPU(const string &_name, int _number_of_threads,
: SimObject(_name), number_of_threads(_number_of_threads)
#endif
{
DPRINTF(FullCPU, "BaseCPU: Creating object, mem address %#x.\n", this);
debug_break();
// add self to global list of CPUs
cpuList.push_back(this);
DPRINTF(FullCPU, "BaseCPU: CPU added to cpuList, mem address %#x.\n",
this);
if (number_of_threads > maxThreadsPerCPU)
maxThreadsPerCPU = number_of_threads;

View file

@ -83,7 +83,7 @@ BaseDynInst<Impl>::BaseDynInst(MachInst machInst, Addr inst_PC,
seqNum = seq_num;
specMemWrite = false;
// specMemWrite = false;
canIssue = false;
issued = false;
@ -95,7 +95,7 @@ BaseDynInst<Impl>::BaseDynInst(MachInst machInst, Addr inst_PC,
blockingInst = false;
recoverInst = false;
specMode = false;
btbMissed = false;
// btbMissed = false;
// Eventually make this a parameter.
threadNumber = 0;
// Also make this a parameter.
@ -139,12 +139,12 @@ BaseDynInst<Impl>::BaseDynInst(StaticInstPtr<ISA> &_staticInst)
effAddr = MemReq::inval_addr;
physEffAddr = MemReq::inval_addr;
specMemWrite = false;
// specMemWrite = false;
blockingInst = false;
recoverInst = false;
specMode = false;
btbMissed = false;
// btbMissed = false;
// Make sure to have the renamed register entries set to the same
// as the normal register entries. It will allow the IQ to work

View file

@ -146,7 +146,10 @@ class BaseDynInst : public FastAlloc, public RefCounted
bool threadsyncWait;
/** If the BTB missed. */
bool btbMissed;
// bool btbMissed;
/** The global history of this instruction (branch). */
// unsigned globalHistory;
/** The thread this instruction is from. */
short threadNumber;
@ -212,7 +215,7 @@ class BaseDynInst : public FastAlloc, public RefCounted
static int instcount;
/** Did this instruction do a spec write? */
bool specMemWrite;
// bool specMemWrite;
private:
/** Physical register index of the destination registers of this
@ -287,15 +290,22 @@ class BaseDynInst : public FastAlloc, public RefCounted
/** Returns whether the instruction was predicted taken or not. */
bool predTaken() {
// DPRINTF(FullCPU, "PC: %08p\n", PC);
// DPRINTF(FullCPU, "predPC: %08p\n", predPC);
return( predPC != (PC + sizeof(MachInst) ) );
}
/** Returns whether the instruction mispredicted. */
bool mispredicted() { return (predPC != nextPC); }
/*
unsigned readGlobalHist() {
return globalHistory;
}
void setGlobalHist(unsigned history) {
globalHistory = history;
}
*/
//
// Instruction types. Forward checks to StaticInst object.
//
@ -452,7 +462,7 @@ class BaseDynInst : public FastAlloc, public RefCounted
OpClass opClass() const { return staticInst->opClass(); }
/** Returns whether or not the BTB missed. */
bool btbMiss() const { return btbMissed; }
// bool btbMiss() const { return btbMissed; }
/** Returns the branch target address. */
Addr branchTarget() const { return staticInst->branchTarget(PC); }
@ -579,8 +589,8 @@ BaseDynInst<Impl>::write(T data, Addr addr, unsigned flags, uint64_t *res)
storeSize = sizeof(T);
storeData = data;
if (specMode)
specMemWrite = true;
// if (specMode)
// specMemWrite = true;
MemReqPtr req = new MemReq(addr, xc, sizeof(T), flags);

View file

@ -75,18 +75,34 @@ DefaultBP::getLocalIndex(Addr &branch_addr)
bool
DefaultBP::lookup(Addr &branch_addr)
{
bool taken;
uint8_t local_prediction;
unsigned local_predictor_idx = getLocalIndex(branch_addr);
DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n",
local_predictor_idx);
assert(local_predictor_idx < localPredictorSize);
local_prediction = localCtrs[local_predictor_idx].read();
DPRINTF(Fetch, "Branch predictor: prediction is %i.\n",
(int)local_prediction);
return getPrediction(local_prediction);
taken = getPrediction(local_prediction);
#if 0
// Speculative update.
if (taken) {
DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n");
localCtrs[local_predictor_idx].increment();
} else {
DPRINTF(Fetch, "Branch predictor: Branch updated as not taken.\n");
localCtrs[local_predictor_idx].decrement();
}
#endif
return taken;
}
void
@ -100,11 +116,17 @@ DefaultBP::update(Addr &branch_addr, bool taken)
DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n",
local_predictor_idx);
assert(local_predictor_idx < localPredictorSize);
// Increment or decrement twice to undo speculative update, then
// properly update
if (taken) {
DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n");
localCtrs[local_predictor_idx].increment();
// localCtrs[local_predictor_idx].increment();
} else {
DPRINTF(Fetch, "Branch predictor: Branch updated as not taken.\n");
localCtrs[local_predictor_idx].decrement();
// localCtrs[local_predictor_idx].decrement();
}
}

View file

@ -19,19 +19,19 @@ template <class Impl>
class AlphaDynInst : public BaseDynInst<Impl>
{
public:
// Typedef for the CPU.
/** Typedef for the CPU. */
typedef typename Impl::FullCPU FullCPU;
//Typedef to get the ISA.
/** Typedef to get the ISA. */
typedef typename Impl::ISA ISA;
/// Binary machine instruction type.
/** Binary machine instruction type. */
typedef typename ISA::MachInst MachInst;
/// Memory address type.
/** Memory address type. */
typedef typename ISA::Addr Addr;
/// Logical register index type.
/** Logical register index type. */
typedef typename ISA::RegIndex RegIndex;
/// Integer register index type.
/** Integer register index type. */
typedef typename ISA::IntReg IntReg;
enum {
@ -54,6 +54,7 @@ class AlphaDynInst : public BaseDynInst<Impl>
return fault;
}
public:
uint64_t readUniq();
void setUniq(uint64_t val);

View file

@ -29,6 +29,8 @@ class AlphaFullCPU : public FullBetaCPU<Impl>
#endif
public:
void regStats();
#ifdef FULL_SYSTEM
bool inPalMode();
@ -66,14 +68,17 @@ class AlphaFullCPU : public FullBetaCPU<Impl>
req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16;
return No_Fault;
}
Fault translateInstReq(MemReqPtr &req)
{
return dummyTranslation(req);
}
Fault translateDataReadReq(MemReqPtr &req)
{
return dummyTranslation(req);
}
Fault translateDataWriteReq(MemReqPtr &req)
{
return dummyTranslation(req);
@ -81,73 +86,6 @@ class AlphaFullCPU : public FullBetaCPU<Impl>
#endif
template <class T>
Fault read(MemReqPtr &req, T &data)
{
#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM)
if (req->flags & LOCKED) {
MiscRegFile *cregs = &req->xc->regs.miscRegs;
cregs->lock_addr = req->paddr;
cregs->lock_flag = true;
}
#endif
Fault error;
error = mem->read(req, data);
data = htoa(data);
return error;
}
template <class T>
Fault write(MemReqPtr &req, T &data)
{
#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM)
MiscRegFile *cregs;
// If this is a store conditional, act appropriately
if (req->flags & LOCKED) {
cregs = &xc->regs.miscRegs;
if (req->flags & UNCACHEABLE) {
// Don't update result register (see stq_c in isa_desc)
req->result = 2;
req->xc->storeCondFailures = 0;//Needed? [RGD]
} else {
req->result = cregs->lock_flag;
if (!cregs->lock_flag ||
((cregs->lock_addr & ~0xf) != (req->paddr & ~0xf))) {
cregs->lock_flag = false;
if (((++req->xc->storeCondFailures) % 100000) == 0) {
std::cerr << "Warning: "
<< req->xc->storeCondFailures
<< " consecutive store conditional failures "
<< "on cpu " << cpu_id
<< std::endl;
}
return No_Fault;
}
else req->xc->storeCondFailures = 0;
}
}
// Need to clear any locked flags on other proccessors for
// this address. Only do this for succsful Store Conditionals
// and all other stores (WH64?). Unsuccessful Store
// Conditionals would have returned above, and wouldn't fall
// through.
for (int i = 0; i < system->execContexts.size(); i++){
cregs = &system->execContexts[i]->regs.miscRegs;
if ((cregs->lock_addr & ~0xf) == (req->paddr & ~0xf)) {
cregs->lock_flag = false;
}
}
#endif
return mem->write(req, (T)htoa(data));
}
// Later on may want to remove this misc stuff from the regfile and
// have it handled at this level. Might prove to be an issue when
// trying to rename source/destination registers...
@ -240,6 +178,76 @@ class AlphaFullCPU : public FullBetaCPU<Impl>
// Called by initCPU. Implement as I please.
void initIPRs(RegFile *regs);
#endif
template <class T>
Fault read(MemReqPtr &req, T &data)
{
#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM)
if (req->flags & LOCKED) {
MiscRegFile *cregs = &req->xc->regs.miscRegs;
cregs->lock_addr = req->paddr;
cregs->lock_flag = true;
}
#endif
Fault error;
error = mem->read(req, data);
data = htoa(data);
return error;
}
template <class T>
Fault write(MemReqPtr &req, T &data)
{
#if defined(TARGET_ALPHA) && defined(FULL_SYSTEM)
MiscRegFile *cregs;
// If this is a store conditional, act appropriately
if (req->flags & LOCKED) {
cregs = &xc->regs.miscRegs;
if (req->flags & UNCACHEABLE) {
// Don't update result register (see stq_c in isa_desc)
req->result = 2;
req->xc->storeCondFailures = 0;//Needed? [RGD]
} else {
req->result = cregs->lock_flag;
if (!cregs->lock_flag ||
((cregs->lock_addr & ~0xf) != (req->paddr & ~0xf))) {
cregs->lock_flag = false;
if (((++req->xc->storeCondFailures) % 100000) == 0) {
std::cerr << "Warning: "
<< req->xc->storeCondFailures
<< " consecutive store conditional failures "
<< "on cpu " << cpu_id
<< std::endl;
}
return No_Fault;
}
else req->xc->storeCondFailures = 0;
}
}
// Need to clear any locked flags on other proccessors for
// this address. Only do this for succsful Store Conditionals
// and all other stores (WH64?). Unsuccessful Store
// Conditionals would have returned above, and wouldn't fall
// through.
for (int i = 0; i < system->execContexts.size(); i++){
cregs = &system->execContexts[i]->regs.miscRegs;
if ((cregs->lock_addr & ~0xf) == (req->paddr & ~0xf)) {
cregs->lock_flag = false;
}
}
#endif
return mem->write(req, (T)htoa(data));
}
};
#endif // __ALPHA_FULL_CPU_HH__

View file

@ -81,17 +81,38 @@ Param<unsigned> issueWidth;
Param<unsigned> executeWidth;
Param<unsigned> executeIntWidth;
Param<unsigned> executeFloatWidth;
Param<unsigned> executeBranchWidth;
Param<unsigned> executeMemoryWidth;
Param<unsigned> iewToCommitDelay;
Param<unsigned> renameToROBDelay;
Param<unsigned> commitWidth;
Param<unsigned> squashWidth;
#if 0
Param<unsigned> localPredictorSize;
Param<unsigned> localPredictorCtrBits;
#endif
Param<unsigned> local_predictor_size;
Param<unsigned> local_ctr_bits;
Param<unsigned> local_history_table_size;
Param<unsigned> local_history_bits;
Param<unsigned> global_predictor_size;
Param<unsigned> global_ctr_bits;
Param<unsigned> global_history_bits;
Param<unsigned> choice_predictor_size;
Param<unsigned> choice_ctr_bits;
Param<unsigned> BTBEntries;
Param<unsigned> BTBTagSize;
Param<unsigned> RASSize;
Param<unsigned> LQEntries;
Param<unsigned> SQEntries;
Param<unsigned> LFSTSize;
Param<unsigned> SSITSize;
Param<unsigned> numPhysIntRegs;
Param<unsigned> numPhysFloatRegs;
Param<unsigned> numIQEntries;
@ -168,6 +189,8 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(BaseFullCPU)
INIT_PARAM(executeWidth, "Execute width"),
INIT_PARAM(executeIntWidth, "Integer execute width"),
INIT_PARAM(executeFloatWidth, "Floating point execute width"),
INIT_PARAM(executeBranchWidth, "Branch execute width"),
INIT_PARAM(executeMemoryWidth, "Memory execute width"),
INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit "
"delay"),
@ -175,12 +198,30 @@ BEGIN_INIT_SIM_OBJECT_PARAMS(BaseFullCPU)
INIT_PARAM(commitWidth, "Commit width"),
INIT_PARAM(squashWidth, "Squash width"),
#if 0
INIT_PARAM(localPredictorSize, "Size of the local predictor in entries. "
"Must be a power of 2."),
INIT_PARAM(localPredictorCtrBits, "Number of bits per counter for bpred"),
#endif
INIT_PARAM(local_predictor_size, "Size of local predictor"),
INIT_PARAM(local_ctr_bits, "Bits per counter"),
INIT_PARAM(local_history_table_size, "Size of local history table"),
INIT_PARAM(local_history_bits, "Bits for the local history"),
INIT_PARAM(global_predictor_size, "Size of global predictor"),
INIT_PARAM(global_ctr_bits, "Bits per counter"),
INIT_PARAM(global_history_bits, "Bits of history"),
INIT_PARAM(choice_predictor_size, "Size of choice predictor"),
INIT_PARAM(choice_ctr_bits, "Bits of choice counters"),
INIT_PARAM(BTBEntries, "Number of BTB entries"),
INIT_PARAM(BTBTagSize, "Size of the BTB tags, in bits"),
INIT_PARAM(RASSize, "RAS size"),
INIT_PARAM(LQEntries, "Number of load queue entries"),
INIT_PARAM(SQEntries, "Number of store queue entries"),
INIT_PARAM(LFSTSize, "Last fetched store table size"),
INIT_PARAM(SSITSize, "Store set ID table size"),
INIT_PARAM(numPhysIntRegs, "Number of physical integer registers"),
INIT_PARAM(numPhysFloatRegs, "Number of physical floating point "
@ -277,17 +318,37 @@ CREATE_SIM_OBJECT(BaseFullCPU)
params.executeWidth = executeWidth;
params.executeIntWidth = executeIntWidth;
params.executeFloatWidth = executeFloatWidth;
params.executeBranchWidth = executeBranchWidth;
params.executeMemoryWidth = executeMemoryWidth;
params.iewToCommitDelay = iewToCommitDelay;
params.renameToROBDelay = renameToROBDelay;
params.commitWidth = commitWidth;
params.squashWidth = squashWidth;
#if 0
params.localPredictorSize = localPredictorSize;
params.localPredictorCtrBits = localPredictorCtrBits;
#endif
params.local_predictor_size = local_predictor_size;
params.local_ctr_bits = local_ctr_bits;
params.local_history_table_size = local_history_table_size;
params.local_history_bits = local_history_bits;
params.global_predictor_size = global_predictor_size;
params.global_ctr_bits = global_ctr_bits;
params.global_history_bits = global_history_bits;
params.choice_predictor_size = choice_predictor_size;
params.choice_ctr_bits = choice_ctr_bits;
params.BTBEntries = BTBEntries;
params.BTBTagSize = BTBTagSize;
params.RASSize = RASSize;
params.LQEntries = LQEntries;
params.SQEntries = SQEntries;
params.SSITSize = SSITSize;
params.LFSTSize = LFSTSize;
params.numPhysIntRegs = numPhysIntRegs;
params.numPhysFloatRegs = numPhysFloatRegs;
params.numIQEntries = numIQEntries;

View file

@ -27,6 +27,19 @@ AlphaFullCPU<Impl>::AlphaFullCPU(Params &params)
rob.setCPU(this);
}
/**
 * Register statistics for the full CPU and each pipeline stage.
 * Delegates to the CPU-wide stat registration first, then to every
 * stage that defines its own regStats() (fetch, decode, rename,
 * IEW, commit).  Must be called once before stats are dumped.
 */
template <class Impl>
void
AlphaFullCPU<Impl>::regStats()
{
// Register stats for everything that has stats.
fullCPURegStats();
fetch.regStats();
decode.regStats();
rename.regStats();
iew.regStats();
commit.regStats();
}
#ifndef FULL_SYSTEM
template <class Impl>
@ -92,6 +105,14 @@ AlphaFullCPU<Impl>::squashStages()
rob.squash(rob_head);
commit.setSquashing();
// Now hack the time buffer to clear the sequence numbers in the places
// where the stages might read it.?
for (int i = 0; i < 5; ++i)
{
timeBuffer.access(-i)->commitInfo.doneSeqNum = 0;
}
}
#endif // FULL_SYSTEM
@ -178,7 +199,7 @@ template <class Impl>
uint64_t *
AlphaFullCPU<Impl>::getIpr()
{
return regs.ipr;
return regFile.getIpr();
}
template <class Impl>
@ -564,7 +585,7 @@ AlphaFullCPU<Impl>::setIntrFlag(int val)
regs.intrflag = val;
}
// Maybe have this send back from IEW stage to squash and update PC.
// Can force commit stage to squash and stuff.
template <class Impl>
Fault
AlphaFullCPU<Impl>::hwrei()

View file

@ -72,6 +72,8 @@ class AlphaSimpleParams : public BaseFullCPU::Params
unsigned executeWidth;
unsigned executeIntWidth;
unsigned executeFloatWidth;
unsigned executeBranchWidth;
unsigned executeMemoryWidth;
//
// Commit
@ -84,17 +86,38 @@ class AlphaSimpleParams : public BaseFullCPU::Params
//
// Branch predictor (BP & BTB)
//
/*
unsigned localPredictorSize;
unsigned localPredictorCtrBits;
*/
unsigned local_predictor_size;
unsigned local_ctr_bits;
unsigned local_history_table_size;
unsigned local_history_bits;
unsigned global_predictor_size;
unsigned global_ctr_bits;
unsigned global_history_bits;
unsigned choice_predictor_size;
unsigned choice_ctr_bits;
unsigned BTBEntries;
unsigned BTBTagSize;
unsigned RASSize;
//
// Load store queue
//
unsigned LQEntries;
unsigned SQEntries;
//
// Memory dependence
//
unsigned SSITSize;
unsigned LFSTSize;
//
// Miscellaneous
//

View file

@ -1,5 +1,6 @@
#include "cpu/beta_cpu/bpred_unit_impl.hh"
#include "cpu/beta_cpu/alpha_impl.hh"
#include "cpu/beta_cpu/alpha_dyn_inst.hh"
template DefaultBPredUnit<AlphaSimpleImpl>;
template TwobitBPredUnit<AlphaSimpleImpl>;

View file

@ -4,9 +4,15 @@
// For Addr type.
#include "arch/alpha/isa_traits.hh"
#include "base/statistics.hh"
#include "cpu/inst_seq.hh"
#include "cpu/beta_cpu/2bit_local_pred.hh"
#include "cpu/beta_cpu/tournament_pred.hh"
#include "cpu/beta_cpu/btb.hh"
#include "cpu/beta_cpu/ras.hh"
#include <list>
/**
* Basically a wrapper class to hold both the branch predictor
@ -18,34 +24,86 @@
* object, and be able to call the constructors on the BP and BTB.
*/
template<class Impl>
class DefaultBPredUnit
class TwobitBPredUnit
{
public:
typedef typename Impl::Params Params;
typedef typename Impl::DynInstPtr DynInstPtr;
DefaultBPredUnit(Params &params);
TwobitBPredUnit(Params &params);
void regStats();
bool predict(DynInstPtr &inst, Addr &PC);
void squash(const InstSeqNum &squashed_sn, const Addr &corr_target,
bool actually_taken);
void squash(const InstSeqNum &squashed_sn);
void update(const InstSeqNum &done_sn);
bool BPLookup(Addr &inst_PC)
{ return BP.lookup(inst_PC); }
unsigned BPReadGlobalHist()
{ return 0; }
bool BTBValid(Addr &inst_PC)
{ return BTB.valid(inst_PC); }
Addr BTBLookup(Addr &inst_PC)
{ return BTB.lookup(inst_PC); }
void BPUpdate(Addr &inst_PC, bool taken)
// Will want to include global history.
void BPUpdate(Addr &inst_PC, unsigned global_history, bool taken)
{ BP.update(inst_PC, taken); }
void BTBUpdate(Addr &inst_PC, Addr &target_PC)
{ BTB.update(inst_PC, target_PC); }
private:
struct PredictorHistory {
PredictorHistory(const InstSeqNum &seq_num, const Addr &inst_PC,
const bool pred_taken)
: seqNum(seq_num), PC(inst_PC), predTaken(pred_taken),
globalHistory(0), usedRAS(0), wasCall(0), RASIndex(0),
RASTarget(0)
{ }
InstSeqNum seqNum;
Addr PC;
bool predTaken;
unsigned globalHistory;
bool usedRAS;
bool wasCall;
unsigned RASIndex;
Addr RASTarget;
};
std::list<PredictorHistory> predHist;
DefaultBP BP;
DefaultBTB BTB;
ReturnAddrStack RAS;
Stats::Scalar<> lookups;
Stats::Scalar<> condPredicted;
Stats::Scalar<> condIncorrect;
Stats::Scalar<> BTBLookups;
Stats::Scalar<> BTBHits;
Stats::Scalar<> BTBCorrect;
Stats::Scalar<> usedRAS;
Stats::Scalar<> RASIncorrect;
};
#endif // __BPRED_UNIT_HH__

View file

@ -1,13 +1,247 @@
#include "cpu/beta_cpu/bpred_unit.hh"
#include "base/traceflags.hh"
#include "base/trace.hh"
template<class Impl>
DefaultBPredUnit<Impl>::DefaultBPredUnit(Params &params)
: BP(params.localPredictorSize,
params.localPredictorCtrBits,
TwobitBPredUnit<Impl>::TwobitBPredUnit(Params &params)
: BP(params.local_predictor_size,
params.local_ctr_bits,
params.instShiftAmt),
BTB(params.BTBEntries,
params.BTBTagSize,
params.instShiftAmt)
params.instShiftAmt),
RAS(params.RASSize)
{
}
/**
 * Register the branch prediction unit's statistics.
 * All stats are registered under "<cpu name>.BPredUnit.*".
 *
 * Fix: the BTBCorrect description string previously opened a
 * parenthesis that was never closed ("...may not work properly.").
 */
template <class Impl>
void
TwobitBPredUnit<Impl>::regStats()
{
    lookups
        .name(name() + ".BPredUnit.lookups")
        .desc("Number of BP lookups")
        ;

    condPredicted
        .name(name() + ".BPredUnit.condPredicted")
        .desc("Number of conditional branches predicted")
        ;

    condIncorrect
        .name(name() + ".BPredUnit.condIncorrect")
        .desc("Number of conditional branches incorrect")
        ;

    BTBLookups
        .name(name() + ".BPredUnit.BTBLookups")
        .desc("Number of BTB lookups")
        ;

    BTBHits
        .name(name() + ".BPredUnit.BTBHits")
        .desc("Number of BTB hits")
        ;

    BTBCorrect
        .name(name() + ".BPredUnit.BTBCorrect")
        .desc("Number of correct BTB predictions (this stat may not "
              "work properly).")
        ;

    usedRAS
        .name(name() + ".BPredUnit.usedRAS")
        .desc("Number of times the RAS was used.")
        ;

    // NOTE(review): the registered name uses historical capitalization
    // ("RASInCorrect"); kept as-is since stat names are output-visible.
    RASIncorrect
        .name(name() + ".BPredUnit.RASInCorrect")
        .desc("Number of incorrect RAS predictions.")
        ;
}
/**
 * Predict the direction and next PC of a control instruction.
 *
 * On entry, PC is the instruction's address; on exit, PC is the
 * predicted next fetch address (branch target, RAS top for returns,
 * or PC + sizeof(MachInst) when predicted not taken).  The
 * instruction's predicted target is set via setPredTarg().
 *
 * A PredictorHistory record of every prediction (including RAS state
 * consumed or pushed) is saved at the front of predHist so that the
 * RAS and predictor can be repaired on a squash, and the predictor
 * trained on commit.  predHist front = youngest branch.
 *
 * @param inst The branch instruction being predicted.
 * @param PC   In: instruction PC.  Out: predicted next PC.
 * @return Whether the branch was predicted taken.
 */
template <class Impl>
bool
TwobitBPredUnit<Impl>::predict(DynInstPtr &inst, Addr &PC)
{
// See if branch predictor predicts taken.
// If so, get its target addr either from the BTB or the RAS.
// Once that's done, speculatively update the predictor?
// Save off record of branch stuff so the RAS can be fixed
// up once it's done.
bool pred_taken = false;
Addr target;
++lookups;
// Unconditional control flow is always taken; only conditional
// branches consult the direction predictor.
if (inst->isUncondCtrl()) {
DPRINTF(Fetch, "BranchPred: Unconditional control.\n");
pred_taken = true;
} else {
++condPredicted;
pred_taken = BPLookup(PC);
DPRINTF(Fetch, "BranchPred: Branch predictor predicted %i for PC %#x"
"\n", pred_taken, inst->readPC());
}
// Snapshot the prediction before any RAS/BTB side effects so it can
// be undone on a squash.
PredictorHistory predict_record(inst->seqNum, PC, pred_taken);
// Now lookup in the BTB or RAS.
if (pred_taken) {
if (inst->isReturn()) {
++usedRAS;
// If it's a function return call, then look up the address
// in the RAS.
target = RAS.top();
// Record the top entry of the RAS, and its index.
predict_record.usedRAS = true;
predict_record.RASIndex = RAS.topIdx();
predict_record.RASTarget = target;
RAS.pop();
DPRINTF(Fetch, "BranchPred: Instruction %#x is a return, RAS "
"predicted target: %#x, RAS index: %i.\n",
inst->readPC(), target, predict_record.RASIndex);
} else {
++BTBLookups;
// Calls push their return address onto the RAS speculatively;
// squash() pops it again via the wasCall flag if this path is wrong.
if (inst->isCall()) {
RAS.push(PC+sizeof(MachInst));
// Record that it was a call so that the top RAS entry can
// be popped off if the speculation is incorrect.
predict_record.wasCall = true;
DPRINTF(Fetch, "BranchPred: Instruction %#x was a call, "
"adding %#x to the RAS.\n",
inst->readPC(), PC+sizeof(MachInst));
}
if (BTB.valid(PC)) {
++BTBHits;
//If it's anything else, use the BTB to get the target addr.
target = BTB.lookup(PC);
DPRINTF(Fetch, "BranchPred: Instruction %#x predicted target "
"is %#x.\n", inst->readPC(), target);
} else {
// No BTB entry: fall back to not-taken.
// NOTE(review): predict_record.predTaken was captured before this
// fallback clears pred_taken, so the history records "taken" while
// the fetch path proceeds not-taken — confirm this is intended.
DPRINTF(Fetch, "BranchPred: BTB doesn't have a valid entry."
"\n");
pred_taken = false;
}
}
}
if (pred_taken) {
// Set the PC and the instruction's predicted target.
PC = target;
inst->setPredTarg(target);
} else {
PC = PC + sizeof(MachInst);
inst->setPredTarg(PC);
}
// Youngest prediction goes at the front of the history list.
predHist.push_front(predict_record);
assert(!predHist.empty());
return pred_taken;
}
/**
 * Commit all branch-history records up to and including sequence
 * number done_sn: train the direction predictor with each branch's
 * recorded outcome, then retire the record.
 *
 * The oldest record sits at the back of predHist, so records are
 * consumed back-to-front.
 *
 * Fix: removed the assert(!predHist.empty()) inside the loop — the
 * while condition already guarantees the list is non-empty.
 *
 * @param done_sn Sequence number of the last committed branch.
 */
template <class Impl>
void
TwobitBPredUnit<Impl>::update(const InstSeqNum &done_sn)
{
    DPRINTF(Fetch, "BranchPred: Commiting branches until sequence number "
            "%i.\n", done_sn);

    while (!predHist.empty() && predHist.back().seqNum <= done_sn) {
        // Update the branch predictor with the correct results of branches.
        BP.update(predHist.back().PC, predHist.back().predTaken);

        predHist.pop_back();
    }
}
/**
 * Squash all speculative branch-history records younger than
 * squashed_sn, undoing any RAS state they changed: a record that
 * consumed a RAS entry has the RAS top restored; a record for a call
 * has its speculatively pushed entry popped.
 *
 * @param squashed_sn Sequence number of the youngest surviving inst.
 */
template <class Impl>
void
TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn)
{
    // Walk from youngest (front) to oldest, stopping at the first
    // record old enough to survive the squash.
    while (!predHist.empty()) {
        PredictorHistory &hist = predHist.front();

        if (hist.seqNum <= squashed_sn)
            break;

        if (hist.usedRAS) {
            // This branch popped the RAS; put the entry back.
            DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, "
                    "target: %#x.\n",
                    hist.RASIndex,
                    hist.RASTarget);

            RAS.restore(hist.RASIndex, hist.RASTarget);
        } else if (hist.wasCall) {
            // This call pushed a return address; discard it.
            DPRINTF(Fetch, "BranchPred: Removing speculative entry added "
                    "to the RAS.\n");

            RAS.pop();
        }

        predHist.pop_front();
    }
}
/**
 * Squash due to a branch misprediction: discard every history record
 * younger than the mispredicted branch (undoing its RAS effects),
 * then correct the mispredicted branch's own record and train the
 * direction predictor and BTB with the true outcome/target.
 *
 * Fix: guard against an empty history before dereferencing
 * predHist.front() — std::list::front() on an empty list is
 * undefined behavior, reachable if no record for the squashed
 * branch remains.
 *
 * @param squashed_sn     Sequence number of the mispredicted branch.
 * @param corr_target     The branch's correct target address.
 * @param actually_taken  The branch's resolved direction.
 */
template <class Impl>
void
TwobitBPredUnit<Impl>::squash(const InstSeqNum &squashed_sn,
                              const Addr &corr_target,
                              const bool actually_taken)
{
    // Now that we know that a branch was mispredicted, we need to undo
    // all the branches that have been seen up until this branch and
    // fix up everything.

    ++condIncorrect;

    DPRINTF(Fetch, "BranchPred: Squashing from sequence number %i, "
            "setting target to %#x.\n",
            squashed_sn, corr_target);

    while (!predHist.empty() && predHist.front().seqNum > squashed_sn) {
        if (predHist.front().usedRAS) {
            // Undo the RAS pop this squashed branch performed.
            DPRINTF(Fetch, "BranchPred: Restoring top of RAS to: %i, "
                    "target: %#x.\n",
                    predHist.front().RASIndex,
                    predHist.front().RASTarget);

            RAS.restore(predHist.front().RASIndex,
                        predHist.front().RASTarget);
        } else if (predHist.front().wasCall) {
            // Undo the speculative RAS push this squashed call performed.
            DPRINTF(Fetch, "BranchPred: Removing speculative entry added "
                    "to the RAS.\n");

            RAS.pop();
        }

        predHist.pop_front();
    }

    // The record now at the front should belong to the mispredicted
    // branch itself; if the history is empty there is nothing to fix up.
    if (predHist.empty()) {
        DPRINTF(Fetch, "BranchPred: No history entry to correct.\n");
        return;
    }

    predHist.front().predTaken = actually_taken;

    if (predHist.front().usedRAS) {
        ++RASIncorrect;
    }

    // Train the predictor and BTB with the resolved outcome.
    BP.update(predHist.front().PC, actually_taken);

    BTB.update(predHist.front().PC, corr_target);
}

View file

@ -50,6 +50,8 @@ DefaultBTB::valid(const Addr &inst_PC)
Addr inst_tag = getTag(inst_PC);
assert(btb_idx < numEntries);
if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) {
return true;
} else {
@ -67,6 +69,8 @@ DefaultBTB::lookup(const Addr &inst_PC)
Addr inst_tag = getTag(inst_PC);
assert(btb_idx < numEntries);
if (btb[btb_idx].valid && inst_tag == btb[btb_idx].tag) {
return btb[btb_idx].target;
} else {
@ -79,6 +83,8 @@ DefaultBTB::update(const Addr &inst_PC, const Addr &target)
{
unsigned btb_idx = getIndex(inst_PC);
assert(btb_idx < numEntries);
btb[btb_idx].valid = true;
btb[btb_idx].target = target;
btb[btb_idx].tag = getTag(inst_PC);

View file

@ -9,6 +9,7 @@
using namespace std;
// Find better place to put this typedef.
// The impl might be the best place for this.
typedef short int PhysRegIndex;
template<class Impl>
@ -45,6 +46,14 @@ struct SimpleIEWSimpleCommit {
int size;
DynInstPtr insts[Impl::MaxWidth + 1];
bool squash;
bool branchMispredict;
bool branchTaken;
uint64_t mispredPC;
uint64_t nextPC;
unsigned globalHist;
InstSeqNum squashedSeqNum;
};
template<class Impl>
@ -63,10 +72,15 @@ struct TimeBufStruct {
bool predIncorrect;
uint64_t branchAddr;
InstSeqNum doneSeqNum;
// Might want to package this kind of branch stuff into a single
// struct as it is used pretty frequently.
bool branchMispredict;
bool branchTaken;
uint64_t mispredPC;
uint64_t nextPC;
unsigned globalHist;
};
decodeComm decodeInfo;
@ -84,17 +98,10 @@ struct TimeBufStruct {
renameComm renameInfo;
struct iewComm {
bool squash;
bool stall;
// Also eventually include skid buffer space.
unsigned freeIQEntries;
bool branchMispredict;
bool branchTaken;
uint64_t mispredPC;
uint64_t nextPC;
InstSeqNum squashedSeqNum;
};
iewComm iewInfo;
@ -108,6 +115,7 @@ struct TimeBufStruct {
bool branchTaken;
uint64_t mispredPC;
uint64_t nextPC;
unsigned globalHist;
// Think of better names here.
// Will need to be a variety of sizes...

View file

@ -59,6 +59,8 @@ class SimpleCommit
public:
SimpleCommit(Params &params);
void regStats();
void setCPU(FullCPU *cpu_ptr);
void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
@ -142,6 +144,17 @@ class SimpleCommit
/** Commit width, in instructions. */
unsigned commitWidth;
Stats::Scalar<> commitCommittedInsts;
Stats::Scalar<> commitSquashedInsts;
Stats::Scalar<> commitSquashEvents;
Stats::Scalar<> commitNonSpecStalls;
Stats::Scalar<> commitCommittedBranches;
Stats::Scalar<> commitCommittedLoads;
Stats::Scalar<> commitCommittedMemRefs;
Stats::Scalar<> branchMispredicts;
Stats::Distribution<> n_committed_dist;
};
#endif // __SIMPLE_COMMIT_HH__

View file

@ -21,6 +21,51 @@ SimpleCommit<Impl>::SimpleCommit(Params &params)
_status = Idle;
}
/**
 * Register the commit stage's statistics.  Must run after the
 * constructor has set commitWidth, which sizes the per-cycle
 * commit distribution.
 *
 * Fix: corrected the typo "commited" in the distribution's
 * description string.
 */
template <class Impl>
void
SimpleCommit<Impl>::regStats()
{
    commitCommittedInsts
        .name(name() + ".commitCommittedInsts")
        .desc("The number of committed instructions")
        .prereq(commitCommittedInsts);
    commitSquashedInsts
        .name(name() + ".commitSquashedInsts")
        .desc("The number of squashed insts skipped by commit")
        .prereq(commitSquashedInsts);
    commitSquashEvents
        .name(name() + ".commitSquashEvents")
        .desc("The number of times commit is told to squash")
        .prereq(commitSquashEvents);
    commitNonSpecStalls
        .name(name() + ".commitNonSpecStalls")
        .desc("The number of times commit has been forced to stall to "
              "communicate backwards")
        .prereq(commitNonSpecStalls);
    commitCommittedBranches
        .name(name() + ".commitCommittedBranches")
        .desc("The number of committed branches")
        .prereq(commitCommittedBranches);
    commitCommittedLoads
        .name(name() + ".commitCommittedLoads")
        .desc("The number of committed loads")
        .prereq(commitCommittedLoads);
    commitCommittedMemRefs
        .name(name() + ".commitCommittedMemRefs")
        .desc("The number of committed memory references")
        .prereq(commitCommittedMemRefs);
    branchMispredicts
        .name(name() + ".branchMispredicts")
        .desc("The number of times a branch was mispredicted")
        .prereq(branchMispredicts);
    n_committed_dist
        .init(0,commitWidth,1)
        .name(name() + ".COM:committed_per_cycle")
        .desc("Number of insts committed each cycle")
        .flags(Stats::pdf)
        ;
}
template <class Impl>
void
SimpleCommit<Impl>::setCPU(FullCPU *cpu_ptr)
@ -143,12 +188,12 @@ SimpleCommit<Impl>::commit()
// Should I also check if the commit stage is telling the ROB to squah?
// This might be necessary to keep the same timing between the IQ and
// the ROB...
if (robInfoFromIEW->iewInfo.squash) {
if (fromIEW->squash) {
DPRINTF(Commit, "Commit: Squashing instructions in the ROB.\n");
_status = ROBSquashing;
InstSeqNum squashed_inst = robInfoFromIEW->iewInfo.squashedSeqNum;
InstSeqNum squashed_inst = fromIEW->squashedSeqNum;
rob->squash(squashed_inst);
@ -162,15 +207,19 @@ SimpleCommit<Impl>::commit()
// ROB is in the process of squashing.
toIEW->commitInfo.robSquashing = true;
toIEW->commitInfo.branchMispredict =
robInfoFromIEW->iewInfo.branchMispredict;
toIEW->commitInfo.branchMispredict = fromIEW->branchMispredict;
toIEW->commitInfo.branchTaken =
robInfoFromIEW->iewInfo.branchTaken;
toIEW->commitInfo.branchTaken = fromIEW->branchTaken;
toIEW->commitInfo.nextPC = robInfoFromIEW->iewInfo.nextPC;
toIEW->commitInfo.nextPC = fromIEW->nextPC;
toIEW->commitInfo.mispredPC = robInfoFromIEW->iewInfo.mispredPC;
toIEW->commitInfo.mispredPC = fromIEW->mispredPC;
toIEW->commitInfo.globalHist = fromIEW->globalHist;
if (toIEW->commitInfo.branchMispredict) {
++branchMispredicts;
}
}
if (_status != ROBSquashing) {
@ -237,6 +286,8 @@ SimpleCommit<Impl>::commitInsts()
// inst in the ROB without affecting any other stages.
rob->retireHead();
++commitSquashedInsts;
} else {
// Increment the total number of non-speculative instructions
// executed.
@ -249,7 +300,7 @@ SimpleCommit<Impl>::commitInsts()
bool commit_success = commitHead(head_inst, num_committed);
// Update what instruction we are looking at if the commit worked.
if(commit_success) {
if (commit_success) {
++num_committed;
// Send back which instruction has been committed.
@ -258,7 +309,11 @@ SimpleCommit<Impl>::commitInsts()
// sequence number instead (copy).
toIEW->commitInfo.doneSeqNum = head_inst->seqNum;
cpu->instDone();
++commitCommittedInsts;
if (!head_inst->isNop()) {
cpu->instDone();
}
} else {
break;
}
@ -267,6 +322,8 @@ SimpleCommit<Impl>::commitInsts()
// Update the pointer to read the next instruction in the ROB.
head_inst = rob->readHeadInst();
}
n_committed_dist.sample(num_committed);
}
template <class Impl>
@ -276,18 +333,13 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
// Make sure instruction is valid
assert(head_inst);
Fault fault = No_Fault;
// If the head instruction is a store or a load, then execute it
// because this simple model does no speculative memory access.
// Hopefully this covers all memory references.
// Also check if it's nonspeculative. Or a nop. Then it will be
// executed only when it reaches the head of the ROB. Actually
// executing a nop is a bit overkill...
// If the instruction is not executed yet, then it is a non-speculative
// or store inst. Signal backwards that it should be executed.
if (!head_inst->isExecuted()) {
// Keep this number correct. We have not yet actually executed
// and committed this instruction.
cpu->funcExeInst--;
if (head_inst->isStore() || head_inst->isNonSpeculative()) {
DPRINTF(Commit, "Commit: Encountered a store or non-speculative "
"instruction at the head of the ROB, PC %#x.\n",
@ -299,6 +351,8 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
// it is executed.
head_inst->clearCanCommit();
++commitNonSpecStalls;
return false;
} else {
panic("Commit: Trying to commit un-executed instruction "
@ -306,19 +360,6 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
}
}
// Check if memory access was successful.
if (fault != No_Fault) {
// Handle data cache miss here. In the future, set the status
// to data cache miss, then exit the stage. Have an event
// that handles commiting the head instruction, then setting
// the stage back to running, when the event is run. (just
// make sure that event is commit's run for that cycle)
panic("Commit: Load/store instruction failed, not sure what "
"to do.\n");
// Also will want to clear the instruction's fault after being
// handled here so it's not handled again below.
}
// Now check if it's one of the special trap or barrier or
// serializing instructions.
if (head_inst->isThreadSync() ||
@ -335,39 +376,43 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
// Check if the instruction caused a fault. If so, trap.
if (head_inst->getFault() != No_Fault) {
#ifdef FULL_SYSTEM
cpu->trap(fault);
#else // !FULL_SYSTEM
if (!head_inst->isNop()) {
#ifdef FULL_SYSTEM
cpu->trap(fault);
#else // !FULL_SYSTEM
panic("fault (%d) detected @ PC %08p", head_inst->getFault(),
head_inst->PC);
}
#endif // FULL_SYSTEM
}
}
// Check if we're really ready to commit. If not then return false.
// I'm pretty sure all instructions should be able to commit if they've
// reached this far. For now leave this in as a check.
if(!rob->isHeadReady()) {
DPRINTF(Commit, "Commit: Unable to commit head instruction!\n");
panic("Commit: Unable to commit head instruction!\n");
return false;
}
// If it's a branch, then send back branch prediction update info
// to the fetch stage.
// This should be handled in the iew stage if a mispredict happens...
#if 0
if (head_inst->isControl()) {
#if 0
toIEW->nextPC = head_inst->readPC();
//Maybe switch over to BTB incorrect.
toIEW->btbMissed = head_inst->btbMiss();
toIEW->target = head_inst->nextPC;
//Maybe also include global history information.
//This simple version will have no branch prediction however.
}
#endif
++commitCommittedBranches;
}
#if 0
// Check if the instruction has a destination register.
// If so add the previous physical register of its logical register's
@ -383,8 +428,12 @@ SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
// the LDSTQ will already have been told that a store has reached the head
// of the ROB. Consider including communication if it's a store as well
// to keep things orthagonal.
if (head_inst->isLoad()) {
toIEW->commitInfo.commitIsLoad = true;
if (head_inst->isMemRef()) {
++commitCommittedMemRefs;
if (head_inst->isLoad()) {
toIEW->commitInfo.commitIsLoad = true;
++commitCommittedLoads;
}
}
// Now that the instruction is going to be committed, finalize its

View file

@ -22,7 +22,7 @@
template<class Impl>
struct SimpleCPUPolicy
{
typedef DefaultBPredUnit<Impl> BPredUnit;
typedef TwobitBPredUnit<Impl> BPredUnit;
typedef PhysRegFile<Impl> RegFile;
typedef SimpleFreeList FreeList;
typedef SimpleRenameMap RenameMap;

View file

@ -49,6 +49,8 @@ class SimpleDecode
public:
SimpleDecode(Params &params);
void regStats();
void setCPU(FullCPU *cpu_ptr);
void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
@ -128,6 +130,15 @@ class SimpleDecode
* group of instructions, it can restart at the proper instruction.
*/
unsigned numInst;
Stats::Scalar<> decodeIdleCycles;
Stats::Scalar<> decodeBlockedCycles;
Stats::Scalar<> decodeUnblockCycles;
Stats::Scalar<> decodeSquashCycles;
Stats::Scalar<> decodeBranchMispred;
Stats::Scalar<> decodeControlMispred;
Stats::Scalar<> decodeDecodedInsts;
Stats::Scalar<> decodeSquashedInsts;
};
#endif // __SIMPLE_DECODE_HH__

View file

@ -16,6 +16,45 @@ SimpleDecode<Impl>::SimpleDecode(Params &params)
_status = Idle;
}
template <class Impl>
void
SimpleDecode<Impl>::regStats()
{
    // Register decode's statistics; each stat names itself as its prereq.
    decodeIdleCycles.name(name() + ".decodeIdleCycles")
        .desc("Number of cycles decode is idle")
        .prereq(decodeIdleCycles);

    decodeBlockedCycles.name(name() + ".decodeBlockedCycles")
        .desc("Number of cycles decode is blocked")
        .prereq(decodeBlockedCycles);

    decodeUnblockCycles.name(name() + ".decodeUnblockCycles")
        .desc("Number of cycles decode is unblocking")
        .prereq(decodeUnblockCycles);

    decodeSquashCycles.name(name() + ".decodeSquashCycles")
        .desc("Number of cycles decode is squashing")
        .prereq(decodeSquashCycles);

    // Mispredictions that decode itself can detect: wrong branch targets
    // and non-control instructions that were predicted taken.
    decodeBranchMispred.name(name() + ".decodeBranchMispred")
        .desc("Number of times decode detected a branch misprediction")
        .prereq(decodeBranchMispred);

    decodeControlMispred.name(name() + ".decodeControlMispred")
        .desc("Number of times decode detected an instruction incorrectly"
              " predicted as a control")
        .prereq(decodeControlMispred);

    decodeDecodedInsts.name(name() + ".decodeDecodedInsts")
        .desc("Number of instructions handled by decode")
        .prereq(decodeDecodedInsts);

    decodeSquashedInsts.name(name() + ".decodeSquashedInsts")
        .desc("Number of squashed instructions handled by decode")
        .prereq(decodeSquashedInsts);
}
template<class Impl>
void
SimpleDecode<Impl>::setCPU(FullCPU *cpu_ptr)
@ -91,7 +130,7 @@ SimpleDecode<Impl>::unblock()
// If there's still information in the skid buffer, then
// continue to tell previous stages to stall. They will be
// able to restart once the skid buffer is empty.
// able to restart once the skid buffer is empty.
if (!skidBuffer.empty()) {
toFetch->decodeInfo.stall = true;
} else {
@ -110,9 +149,12 @@ SimpleDecode<Impl>::squash(DynInstPtr &inst)
"detected at decode.\n");
Addr new_PC = inst->nextPC;
toFetch->decodeInfo.branchMispredict = true;
toFetch->decodeInfo.doneSeqNum = inst->seqNum;
toFetch->decodeInfo.predIncorrect = true;
toFetch->decodeInfo.squash = true;
toFetch->decodeInfo.nextPC = new_PC;
toFetch->decodeInfo.branchTaken = true;
// Set status to squashing.
_status = Squashing;
@ -164,6 +206,8 @@ SimpleDecode<Impl>::tick()
// buffer were used. Remove those instructions and handle
// the rest of unblocking.
if (_status == Unblocking) {
++decodeUnblockCycles;
if (fromFetch->size > 0) {
// Add the current inputs to the skid buffer so they can be
// reprocessed when this stage unblocks.
@ -173,6 +217,8 @@ SimpleDecode<Impl>::tick()
unblock();
}
} else if (_status == Blocked) {
++decodeBlockedCycles;
if (fromFetch->size > 0) {
block();
}
@ -197,6 +243,8 @@ SimpleDecode<Impl>::tick()
squash();
}
} else if (_status == Squashing) {
++decodeSquashCycles;
if (!fromCommit->commitInfo.squash &&
!fromCommit->commitInfo.robSquashing) {
_status = Running;
@ -228,17 +276,16 @@ SimpleDecode<Impl>::decode()
// Check fetch queue to see if instructions are available.
// If no available instructions, do nothing, unless this stage is
// currently unblocking.
if (!fromFetch->insts[0] && _status != Unblocking) {
if (fromFetch->size == 0 && _status != Unblocking) {
DPRINTF(Decode, "Decode: Nothing to do, breaking out early.\n");
// Should I change the status to idle?
++decodeIdleCycles;
return;
}
// Might be better to use a base DynInst * instead?
DynInstPtr inst;
// Instead have a class member variable that records which instruction
// was the last one that was ended on. At the tick() stage, it can
// check if that's equal to 0. If not, then don't pop stuff off.
unsigned to_rename_index = 0;
int insts_available = _status == Unblocking ?
@ -264,18 +311,10 @@ SimpleDecode<Impl>::decode()
}
#endif
// Check to make sure that instructions coming from fetch are valid.
// Normally at this stage the branch target of PC-relative branches
// should be computed here. However in this simple model all
// computation will take place at execute. Hence doneTargCalc()
// will always be false.
while (insts_available > 0)
{
DPRINTF(Decode, "Decode: Sending instruction to rename.\n");
// Might create some sort of accessor to get an instruction
// on a per thread basis. Or might be faster to just get
// a pointer to an array or list of instructions and use that
// within this code.
inst = _status == Unblocking ? skidBuffer.front().insts[numInst] :
fromFetch->insts[numInst];
@ -287,6 +326,8 @@ SimpleDecode<Impl>::decode()
"squashed, skipping.\n",
inst->seqNum, inst->readPC());
++decodeSquashedInsts;
++numInst;
--insts_available;
@ -305,16 +346,22 @@ SimpleDecode<Impl>::decode()
if (inst->predTaken() && !inst->isControl()) {
panic("Instruction predicted as a branch!");
++decodeControlMispred;
// Might want to set some sort of boolean and just do
// a check at the end
squash(inst);
break;
}
// Ensure that the predicted branch target is the actual branch
// target if possible (branches that are PC relative).
if (inst->isControl() && inst->doneTargCalc()) {
// Go ahead and compute any PC-relative branches.
if (inst->isDirectCtrl() && inst->isUncondCtrl() &&
inst->numDestRegs() == 0 && inst->numSrcRegs() == 0) {
inst->execute();
inst->setExecuted();
if (inst->mispredicted()) {
++decodeBranchMispred;
// Might want to set some sort of boolean and just do
// a check at the end
squash(inst);
@ -322,6 +369,11 @@ SimpleDecode<Impl>::decode()
}
}
// Normally can check if a direct branch has the right target
// addr (either the immediate, or the branch PC + 4) and redirect
// fetch if it's incorrect.
// Also check if instructions have no source registers. Mark
// them as ready to issue at any time. Not sure if this check
// should exist here or at a later stage; however it doesn't matter
@ -334,6 +386,7 @@ SimpleDecode<Impl>::decode()
// Increment which instruction we're looking at.
++numInst;
++to_rename_index;
++decodeDecodedInsts;
--insts_available;
}

View file

@ -14,6 +14,7 @@
#include "sim/eventq.hh"
#include "cpu/pc_event.hh"
#include "mem/mem_interface.hh"
#include "base/statistics.hh"
/**
* SimpleFetch class to fetch a single instruction each cycle. SimpleFetch
@ -59,6 +60,8 @@ class SimpleFetch
/** SimpleFetch constructor. */
SimpleFetch(Params &params);
void regStats();
void setCPU(FullCPU *cpu_ptr);
void setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer);
@ -73,9 +76,13 @@ class SimpleFetch
// private:
// Figure out PC vs next PC and how it should be updated
void squash(Addr newPC);
void squash(const Addr &new_PC);
private:
inline void doSquash(const Addr &new_PC);
void squashFromDecode(const Addr &new_PC, const InstSeqNum &seq_num);
/**
* Looks up in the branch predictor to see if the next PC should be
* either next PC+=MachInst or a branch target.
@ -84,7 +91,27 @@ class SimpleFetch
* the next PC will be.
* @return Whether or not a branch was predicted as taken.
*/
bool lookupAndUpdateNextPC(Addr &next_PC);
bool lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC);
// Might not want this function...
// inline void recordGlobalHist(DynInstPtr &inst);
/**
* Fetches the cache line that contains fetch_PC. Returns any
* fault that happened. Puts the data into the class variable
* cacheData.
* @params fetch_PC The PC address that is being fetched from.
* @return Any fault that occured.
*/
Fault fetchCacheLine(Addr fetch_PC);
// Align an address (typically a PC) to the start of an I-cache block.
// We fold in the PISA 64- to 32-bit conversion here as well.
Addr icacheBlockAlignPC(Addr addr)
{
addr = ISA::realPCToFetchPC(addr);
return (addr & ~(cacheBlkMask));
}
public:
class CacheCompletionEvent : public Event
@ -99,7 +126,7 @@ class SimpleFetch
virtual const char *description();
};
CacheCompletionEvent cacheCompletionEvent;
// CacheCompletionEvent cacheCompletionEvent;
private:
/** Pointer to the FullCPU. */
@ -152,20 +179,32 @@ class SimpleFetch
unsigned fetchWidth;
/** Cache block size. */
int blkSize;
int cacheBlkSize;
/** Mask to get a cache block's address. */
Addr cacheBlockMask;
Addr cacheBlkMask;
/** The instruction being fetched. */
MachInst inst;
// MachInst inst;
/** The cache line being fetched. */
uint8_t *cacheData;
/** Size of instructions. */
int instSize;
/** Icache stall statistics. */
// Stats::Scalar<> icacheStallCycles;
// Counter lastIcacheStall;
Counter lastIcacheStall;
Stats::Scalar<> icacheStallCycles;
Stats::Scalar<> fetchedInsts;
Stats::Scalar<> predictedBranches;
Stats::Scalar<> fetchCycles;
Stats::Scalar<> fetchSquashCycles;
Stats::Scalar<> fetchBlockedCycles;
Stats::Scalar<> fetchedCacheLines;
Stats::Distribution<> fetch_nisn_dist;
};
#endif //__SIMPLE_FETCH_HH__

View file

@ -1,10 +1,8 @@
// Todo: Add in branch prediction. With probe path, should
// be able to specify
// size of data to fetch. Will be able to get full cache line.
// Remove this later.
// Remove this later; used only for debugging.
#define OPCODE(X) (X >> 26) & 0x3f
#include "arch/alpha/byte_swap.hh"
#include "cpu/exetrace.hh"
#include "mem/base_mem.hh"
#include "mem/mem_interface.hh"
@ -37,15 +35,14 @@ SimpleFetch<Impl>::CacheCompletionEvent::description()
template<class Impl>
SimpleFetch<Impl>::SimpleFetch(Params &params)
: cacheCompletionEvent(this),
: //cacheCompletionEvent(this),
icacheInterface(params.icacheInterface),
branchPred(params),
decodeToFetchDelay(params.decodeToFetchDelay),
renameToFetchDelay(params.renameToFetchDelay),
iewToFetchDelay(params.iewToFetchDelay),
commitToFetchDelay(params.commitToFetchDelay),
fetchWidth(params.fetchWidth),
inst(0)
fetchWidth(params.fetchWidth)
{
// Set status to idle.
_status = Idle;
@ -62,13 +59,63 @@ SimpleFetch<Impl>::SimpleFetch(Params &params)
memReq->data = new uint8_t[64];
// Size of cache block.
blkSize = icacheInterface ? icacheInterface->getBlockSize() : 64;
cacheBlkSize = icacheInterface ? icacheInterface->getBlockSize() : 64;
// Create mask to get rid of offset bits.
cacheBlockMask = (blkSize - 1);
cacheBlkMask = (cacheBlkSize - 1);
// Get the size of an instruction.
instSize = sizeof(MachInst);
// Create space to store a cache line.
cacheData = new uint8_t[cacheBlkSize];
}
template <class Impl>
void
SimpleFetch<Impl>::regStats()
{
    // Register fetch's statistics; each stat names itself as its prereq.
    icacheStallCycles.name(name() + ".icacheStallCycles")
        .desc("Number of cycles fetch is stalled on an Icache miss")
        .prereq(icacheStallCycles);

    fetchedInsts.name(name() + ".fetchedInsts")
        .desc("Number of instructions fetch has processed")
        .prereq(fetchedInsts);

    predictedBranches.name(name() + ".predictedBranches")
        .desc("Number of branches that fetch has predicted taken")
        .prereq(predictedBranches);

    fetchCycles.name(name() + ".fetchCycles")
        .desc("Number of cycles fetch has run and was not squashing or"
              " blocked")
        .prereq(fetchCycles);

    fetchSquashCycles.name(name() + ".fetchSquashCycles")
        .desc("Number of cycles fetch has spent squashing")
        .prereq(fetchSquashCycles);

    fetchBlockedCycles.name(name() + ".fetchBlockedCycles")
        .desc("Number of cycles fetch has spent blocked")
        .prereq(fetchBlockedCycles);

    fetchedCacheLines.name(name() + ".fetchedCacheLines")
        .desc("Number of cache lines fetched")
        .prereq(fetchedCacheLines);

    // Fetch-rate distribution: instructions fetched per cycle, bucketed
    // one-wide from zero up to the full fetch width.
    fetch_nisn_dist.init(0, fetchWidth, 1)
        .name(name() + ".FETCH:rate_dist")
        .desc("Number of instructions fetched each cycle (Total)")
        .flags(Stats::pdf);

    // The branch predictor unit registers its own statistics.
    branchPred.regStats();
}
template<class Impl>
@ -122,19 +169,40 @@ SimpleFetch<Impl>::processCacheCompletion()
_status = IcacheMissComplete;
}
template<class Impl>
bool
SimpleFetch<Impl>::lookupAndUpdateNextPC(Addr &next_PC)
#if 0
template <class Impl>
inline void
SimpleFetch<Impl>::recordGlobalHist(DynInstPtr &inst)
{
inst->setGlobalHist(branchPred.BPReadGlobalHist());
}
#endif
template <class Impl>
bool
SimpleFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC)
{
    // A bit of a misnomer: on entry next_PC actually holds the current
    // instruction's PC; this function overwrites it with the predicted PC
    // of the *next* instruction and records that target on the inst.
    //
    // NOTE(review): the previous body carried dead "#if 0" blocks containing
    // stray diff residue (including a literal hunk-header line and a
    // statement missing its semicolon) plus an unused local predict_target.
    // That residue is removed here; live behavior is unchanged.

    // Non-control instructions always fall straight through.
    if (!inst->isControl()) {
        next_PC = next_PC + instSize;
        inst->setPredTarg(next_PC);
        return false;
    }

    // Let the branch predictor unit choose the next PC; it also records the
    // prediction history so it can be repaired on a later misprediction.
    bool predict_taken = branchPred.predict(inst, next_PC);

    if (predict_taken) {
        ++predictedBranches;
    }

    return predict_taken;
}
template<class Impl>
void
SimpleFetch<Impl>::squash(Addr new_PC)
template <class Impl>
Fault
SimpleFetch<Impl>::fetchCacheLine(Addr fetch_PC)
{
    // Fetch the entire cache line that contains fetch_PC into the class
    // member cacheData.  The functional read happens immediately; the
    // timing (icache) access may miss, in which case this stage goes into
    // IcacheMissStall and is woken later by a CacheCompletionEvent.
    // Returns any fault raised by translation or the read.

#ifdef FULL_SYSTEM
    // Flag to say whether or not address is physical addr.
    unsigned flags = cpu->inPalMode() ? PHYSICAL : 0;
#else
    unsigned flags = 0;
#endif // FULL_SYSTEM

    Fault fault = No_Fault;

    // Align the fetch PC so it's at the start of a cache block.
    fetch_PC = icacheBlockAlignPC(fetch_PC);

    // Setup the memReq to do a read of the whole cache block starting at
    // the aligned instruction address, with the appropriate flags.
    memReq->cmd = Read;
    memReq->reset(fetch_PC, cacheBlkSize, flags);

    // Translate the instruction request.
    // Should this function be in the CPU class?  Probably... ITB/DTB
    // should exist within the CPU.
    fault = cpu->translateInstReq(memReq);

    // In the case of faults, the fetch stage may need to stall and wait
    // on what caused the fetch (ITB or Icache miss).

    // If translation was successful, do the functional read of the whole
    // line into cacheData.
    if (fault == No_Fault) {
        DPRINTF(Fetch, "Fetch: Doing instruction read.\n");
        fault = cpu->mem->read(memReq, cacheData);
        // This read may change when the mem interface changes.
        fetchedCacheLines++;
    }

    // Now do the timing access to see whether or not the instruction
    // exists within the cache.
    if (icacheInterface && fault == No_Fault) {
        DPRINTF(Fetch, "Fetch: Doing timing memory access.\n");
        memReq->completionEvent = NULL;

        memReq->time = curTick;

        MemAccessResult result = icacheInterface->access(memReq);

        // If the cache missed (in this model functional and timing
        // memories are different), then schedule an event to wake
        // up this stage once the cache miss completes.
        if (result != MA_HIT && icacheInterface->doEvents()) {
            // NOTE(review): presumably the event queue owns and frees this
            // event after it fires — confirm it is not leaked on a squash.
            memReq->completionEvent = new CacheCompletionEvent(this);
//	    lastIcacheStall = curTick;

            // How does current model work as far as individual
            // stages scheduling/unscheduling?
            // Perhaps have only the main CPU scheduled/unscheduled,
            // and have it choose what stages to run appropriately.
            DPRINTF(Fetch, "Fetch: Stalling due to icache miss.\n");
            _status = IcacheMissStall;
        }
    }

    return fault;
}
template <class Impl>
inline void
SimpleFetch<Impl>::doSquash(const Addr &new_PC)
{
    // Common squash work shared by squash-from-commit and
    // squash-from-decode: redirect the PC, cancel any outstanding icache
    // miss, and put this stage into the Squashing state.
    DPRINTF(Fetch, "Fetch: Squashing, setting PC to: %#x.\n", new_PC);

    cpu->setNextPC(new_PC + instSize);
    cpu->setPC(new_PC);

    // Clear the icache miss if it's outstanding.  This check must happen
    // BEFORE _status is overwritten below; the original code assigned
    // _status = Squashing first, which made this branch unreachable.
    if (_status == IcacheMissStall && icacheInterface) {
        DPRINTF(Fetch, "Fetch: Squashing outstanding Icache miss.\n");
        // @todo: Use an actual thread number here.
        icacheInterface->squash(0);
    }

    _status = Squashing;

    ++fetchSquashCycles;
}
// Squash triggered by a misprediction caught at decode: only the
// instructions between fetch and decode are in flight and need removal.
template<class Impl>
void
SimpleFetch<Impl>::squashFromDecode(const Addr &new_PC,
                                    const InstSeqNum &seq_num)
{
    DPRINTF(Fetch, "Fetch: Squashing from decode.\n");

    // Redirect the PC and move this stage into the Squashing state.
    doSquash(new_PC);

    // Tell the CPU to remove any instructions that are in flight between
    // fetch and decode (i.e. younger than seq_num).
    cpu->removeInstsUntil(seq_num);
}
// Squash triggered from commit: everything younger than the ROB's
// contents is wrong-path and must be removed, not just fetch/decode.
template <class Impl>
void
SimpleFetch<Impl>::squash(const Addr &new_PC)
{
    DPRINTF(Fetch, "Fetch: Squash from commit.\n");

    // Redirect the PC and move this stage into the Squashing state.
    doSquash(new_PC);

    // Tell the CPU to remove any instructions that are not in the ROB.
    cpu->removeInstsNotInROB();
}
@ -185,7 +346,6 @@ template<class Impl>
void
SimpleFetch<Impl>::tick()
{
#if 1
// Check squash signals from commit.
if (fromCommit->commitInfo.squash) {
DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
@ -196,13 +356,18 @@ SimpleFetch<Impl>::tick()
// Also check if there's a mispredict that happened.
if (fromCommit->commitInfo.branchMispredict) {
branchPred.BPUpdate(fromCommit->commitInfo.mispredPC,
fromCommit->commitInfo.branchTaken);
branchPred.BTBUpdate(fromCommit->commitInfo.mispredPC,
fromCommit->commitInfo.nextPC);
branchPred.squash(fromCommit->commitInfo.doneSeqNum,
fromCommit->commitInfo.nextPC,
fromCommit->commitInfo.branchTaken);
} else {
branchPred.squash(fromCommit->commitInfo.doneSeqNum);
}
return;
} else if (fromCommit->commitInfo.doneSeqNum) {
// Update the branch predictor if it wasn't a squashed instruction
// that was braodcasted.
branchPred.update(fromCommit->commitInfo.doneSeqNum);
}
// Check ROB squash signals from commit.
@ -211,6 +376,8 @@ SimpleFetch<Impl>::tick()
// Continue to squash.
_status = Squashing;
++fetchSquashCycles;
return;
}
@ -220,22 +387,22 @@ SimpleFetch<Impl>::tick()
"from decode.\n");
// Update the branch predictor.
if (fromCommit->decodeInfo.branchMispredict) {
branchPred.BPUpdate(fromDecode->decodeInfo.mispredPC,
fromDecode->decodeInfo.branchTaken);
branchPred.BTBUpdate(fromDecode->decodeInfo.mispredPC,
fromDecode->decodeInfo.nextPC);
if (fromDecode->decodeInfo.branchMispredict) {
branchPred.squash(fromDecode->decodeInfo.doneSeqNum,
fromDecode->decodeInfo.nextPC,
fromDecode->decodeInfo.branchTaken);
} else {
branchPred.squash(fromDecode->decodeInfo.doneSeqNum);
}
if (_status != Squashing) {
// Squash unless we're already squashing?
squash(fromDecode->decodeInfo.nextPC);
squashFromDecode(fromDecode->decodeInfo.nextPC,
fromDecode->decodeInfo.doneSeqNum);
return;
}
}
// Check if any of the stall signals are high.
if (fromDecode->decodeInfo.stall ||
fromRename->renameInfo.stall ||
@ -253,12 +420,15 @@ SimpleFetch<Impl>::tick()
fromCommit->commitInfo.stall);
_status = Blocked;
++fetchBlockedCycles;
return;
} else if (_status == Blocked) {
// Unblock stage if status is currently blocked and none of the
// stall signals are being held high.
_status = Running;
++fetchBlockedCycles;
return;
}
@ -273,74 +443,15 @@ SimpleFetch<Impl>::tick()
// Switch status to running
_status = Running;
++fetchSquashCycles;
} else if (_status != IcacheMissStall) {
DPRINTF(Fetch, "Fetch: Running stage.\n");
fetch();
}
#endif
#if 0
if (_status != Blocked &&
_status != Squashing &&
_status != IcacheMissStall) {
DPRINTF(Fetch, "Fetch: Running stage.\n");
++fetchCycles;
fetch();
} else if (_status == Blocked) {
// If still being told to stall, do nothing.
if (fromDecode->decodeInfo.stall ||
fromRename->renameInfo.stall ||
fromIEW->iewInfo.stall ||
fromCommit->commitInfo.stall)
{
DPRINTF(Fetch, "Fetch: Stalling stage.\n");
DPRINTF(Fetch, "Fetch: Statuses: Decode: %i Rename: %i IEW: %i "
"Commit: %i\n",
fromDecode->decodeInfo.stall,
fromRename->renameInfo.stall,
fromIEW->iewInfo.stall,
fromCommit->commitInfo.stall);
} else {
DPRINTF(Fetch, "Fetch: Done blocking.\n");
_status = Running;
}
if (fromCommit->commitInfo.squash) {
DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
"from commit.\n");
squash(fromCommit->commitInfo.nextPC);
return;
} else if (fromDecode->decodeInfo.squash) {
DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
"from decode.\n");
squash(fromDecode->decodeInfo.nextPC);
return;
} else if (fromCommit->commitInfo.robSquashing) {
DPRINTF(Fetch, "Fetch: ROB is still squashing.\n");
_status = Squashing;
return;
}
} else if (_status == Squashing) {
// If there are no squash signals then change back to running.
// Note that when a squash starts happening, commitInfo.squash will
// be high. But if the squash is still in progress, then only
// commitInfo.robSquashing will be high.
if (!fromCommit->commitInfo.squash &&
!fromCommit->commitInfo.robSquashing) {
DPRINTF(Fetch, "Fetch: Done squashing.\n");
_status = Running;
} else if (fromCommit->commitInfo.squash) {
// If there's a new squash, then start squashing again.
squash(fromCommit->commitInfo.nextPC);
} else {
// Purely a debugging statement.
DPRINTF(Fetch, "Fetch: ROB still squashing.\n");
}
}
#endif
}
template<class Impl>
@ -351,13 +462,6 @@ SimpleFetch<Impl>::fetch()
// Start actual fetch
//////////////////////////////////////////
#ifdef FULL_SYSTEM
// Flag to say whether or not address is physical addr.
unsigned flags = cpu->inPalMode() ? PHYSICAL : 0;
#else
unsigned flags = 0;
#endif // FULL_SYSTEM
// The current PC.
Addr fetch_PC = cpu->readPC();
@ -379,64 +483,14 @@ SimpleFetch<Impl>::fetch()
"instruction, starting at PC %08p.\n",
fetch_PC);
// Otherwise check if the instruction exists within the cache.
// If it does, then proceed on to read the instruction and the rest
// of the instructions in the cache line until either the end of the
// cache line or a predicted taken branch is encountered.
// Note that this simply checks if the first instruction exists
// within the cache, assuming the rest of the cache line also exists
// within the cache.
fault = fetchCacheLine(fetch_PC);
}
// Setup the memReq to do a read of the first isntruction's address.
// Set the appropriate read size and flags as well.
memReq->cmd = Read;
memReq->reset(fetch_PC, instSize, flags);
// Translate the instruction request.
// Should this function be
// in the CPU class ? Probably...ITB/DTB should exist within the
// CPU.
fault = cpu->translateInstReq(memReq);
// In the case of faults, the fetch stage may need to stall and wait
// on what caused the fetch (ITB or Icache miss).
// If translation was successful, attempt to read the first
// instruction.
if (fault == No_Fault) {
DPRINTF(Fetch, "Fetch: Doing instruction read.\n");
fault = cpu->mem->read(memReq, inst);
// This read may change when the mem interface changes.
}
// Now do the timing access to see whether or not the instruction
// exists within the cache.
if (icacheInterface && fault == No_Fault) {
DPRINTF(Fetch, "Fetch: Doing timing memory access.\n");
memReq->completionEvent = NULL;
memReq->time = curTick;
MemAccessResult result = icacheInterface->access(memReq);
// If the cache missed (in this model functional and timing
// memories are different), then schedule an event to wake
// up this stage once the cache miss completes.
if (result != MA_HIT && icacheInterface->doEvents()) {
memReq->completionEvent = &cacheCompletionEvent;
// lastIcacheStall = curTick;
// How does current model work as far as individual
// stages scheduling/unscheduling?
// Perhaps have only the main CPU scheduled/unscheduled,
// and have it choose what stages to run appropriately.
DPRINTF(Fetch, "Fetch: Stalling due to icache miss.\n");
_status = IcacheMissStall;
return;
}
}
// If we had a stall due to an icache miss, then return. It'd
// be nicer if this were handled through the kind of fault that
// is returned by the function.
if (_status == IcacheMissStall) {
return;
}
// As far as timing goes, the CPU will need to send an event through
@ -446,11 +500,15 @@ SimpleFetch<Impl>::fetch()
Addr next_PC = fetch_PC;
InstSeqNum inst_seq;
MachInst inst;
unsigned offset = fetch_PC & cacheBlkMask;
unsigned fetched;
// If the read of the first instruction was successful, then grab the
// instructions from the rest of the cache line and put them into the
// queue heading to decode.
if (fault == No_Fault) {
// If the read of the first instruction was successful, then grab the
// instructions from the rest of the cache line and put them into the
// queue heading to decode.
DPRINTF(Fetch, "Fetch: Adding instructions to queue to decode.\n");
//////////////////////////
@ -461,124 +519,59 @@ SimpleFetch<Impl>::fetch()
// ended this fetch block.
bool predicted_branch = false;
// Might want to keep track of various stats.
// numLinesFetched++;
// Get a sequence number.
inst_seq = cpu->getAndIncrementInstSeq();
// Update the next PC; it either is PC+sizeof(MachInst), or
// branch_target. Check whether or not a branch was taken.
predicted_branch = lookupAndUpdateNextPC(next_PC);
// Because the first instruction was already fetched, create the
// DynInst and put it into the queue to decode.
DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC,
inst_seq, cpu);
DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n",
inst_seq, instruction->readPC());
DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n",
OPCODE(inst));
instruction->traceData =
Trace::getInstRecord(curTick, cpu->xcBase(), cpu,
instruction->staticInst,
instruction->readPC(), 0);
cpu->addInst(instruction);
// Write the instruction to the first slot in the queue
// that heads to decode.
toDecode->insts[0] = instruction;
toDecode->size++;
fetch_PC = next_PC;
//////////////////////////
// Fetch other instructions
//////////////////////////
// Obtain the index into the cache line by getting only the low
// order bits. Will need to do shifting as well.
int line_index = fetch_PC & cacheBlockMask;
// Take instructions and put them into the queue heading to decode.
// Then read the next instruction in the cache line. Continue
// until either all of the fetch bandwidth is used (not an issue for
// non-SMT), or the end of the cache line is reached. Note that
// this assumes standard cachelines, and not something like a trace
// cache where lines might not end at cache-line size aligned
// addresses.
// @todo: Fix the horrible amount of translates/reads that must
// take place due to reading an entire cacheline. Ideally it
// should all take place at once, return an array of binary
// instructions, which can then be used to get all the instructions
// needed. Figure out if I can roll it back into one loop.
for (int fetched = 1;
line_index < blkSize &&
for (fetched = 0;
offset < cacheBlkSize &&
fetched < fetchWidth &&
!predicted_branch;
line_index+=instSize, ++fetched)
++fetched)
{
// Reset the mem request to setup the read of the next
// instruction.
memReq->reset(fetch_PC, instSize, flags);
// Translate the instruction request.
fault = cpu->translateInstReq(memReq);
// Read instruction.
if (fault == No_Fault) {
fault = cpu->mem->read(memReq, inst);
}
// Check if there was a fault.
if (fault != No_Fault) {
panic("Fetch: Read of instruction faulted when it should "
"succeed; most likely exceeding cache line.\n");
}
// Get a sequence number.
inst_seq = cpu->getAndIncrementInstSeq();
predicted_branch = lookupAndUpdateNextPC(next_PC);
// Make sure this is a valid index.
assert(offset <= cacheBlkSize - instSize);
// Create the actual DynInst. Parameters are:
// DynInst(instruction, PC, predicted PC, CPU pointer).
// Because this simple model has no branch prediction, the
// predicted PC will simply be PC+sizeof(MachInst).
// Update to actually use a branch predictor to predict the
// target in the future.
DynInstPtr instruction =
new DynInst(inst, fetch_PC, next_PC, inst_seq, cpu);
// Get the instruction from the array of the cache line.
inst = htoa(*reinterpret_cast<MachInst *>
(&cacheData[offset]));
// Create a new DynInst from the instruction fetched.
DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC,
inst_seq, cpu);
DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n",
inst_seq, instruction->readPC());
DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n",
OPCODE(inst));
instruction->traceData =
Trace::getInstRecord(curTick, cpu->xcBase(), cpu,
instruction->staticInst,
instruction->readPC(), 0);
DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n",
inst_seq, instruction->readPC());
DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n",
OPCODE(inst));
predicted_branch = lookupAndUpdateNextPC(instruction, next_PC);
// Add instruction to the CPU's list of instructions.
cpu->addInst(instruction);
// Write the instruction to the proper slot in the queue
// Write the instruction to the first slot in the queue
// that heads to decode.
toDecode->insts[fetched] = instruction;
toDecode->size++;
// Might want to keep track of various stats.
// numInstsFetched++;
// Increment stat of fetched instructions.
++fetchedInsts;
// Update the PC with the next PC.
// Move to the next instruction, unless we have a branch.
fetch_PC = next_PC;
offset+= instSize;
}
fetch_nisn_dist.sample(fetched);
}
// Now that fetching is completed, update the PC to signify what the next
@ -592,6 +585,12 @@ SimpleFetch<Impl>::fetch()
cpu->setPC(next_PC);
cpu->setNextPC(next_PC + instSize);
} else {
// If the issue was an icache miss, then we can just return and
// wait until it is handled.
if (_status == IcacheMissStall) {
return;
}
// Handle the fault.
// This stage will not be able to continue until all the ROB
// slots are empty, at which point the fault can be handled.

View file

@ -6,11 +6,9 @@
#include "arch/alpha/isa_traits.hh"
#include "cpu/beta_cpu/comm.hh"
#include "base/traceflags.hh"
#include "base/trace.hh"
// Question: Do I even need the number of logical registers?
// How to avoid freeing registers instantly? Same with ROB entries.
/**
* FreeList class that simply holds the list of free integer and floating
* point registers. Can request for a free register of either type, and
@ -153,8 +151,6 @@ SimpleFreeList::addIntReg(PhysRegIndex freed_reg)
assert(!freeIntRegsScoreboard[freed_reg]);
freeIntRegsScoreboard[freed_reg] = 1;
//Might want to add in a check for whether or not this register is
//already in there. A bit vector or something similar would be useful.
freeIntRegs.push(freed_reg);
}
@ -167,8 +163,6 @@ SimpleFreeList::addFloatReg(PhysRegIndex freed_reg)
assert(!freeFloatRegsScoreboard[freed_reg]);
freeFloatRegsScoreboard[freed_reg] = 1;
//Might want to add in a check for whether or not this register is
//already in there. A bit vector or something similar would be useful.
freeFloatRegs.push(freed_reg);
}

View file

@ -166,6 +166,13 @@ FullBetaCPU<Impl>::~FullBetaCPU()
{
}
template <class Impl>
void
FullBetaCPU<Impl>::fullCPURegStats()
{
    // Placeholder for registering stats owned by the FullCPU itself.
    // Currently empty: the individual pipeline stages (e.g. IEW, the
    // instruction queue) register their own stats in their regStats()
    // methods, so there is nothing CPU-wide to register yet.
    // Register any of the FullCPU's stats here.
}
template <class Impl>
void
FullBetaCPU<Impl>::tick()
@ -424,19 +431,17 @@ template <class Impl>
void
FullBetaCPU<Impl>::removeFrontInst(DynInstPtr &inst)
{
DynInstPtr inst_to_delete;
DynInstPtr inst_to_remove;
// The front instruction should be the same one being asked to be deleted.
// The front instruction should be the same one being asked to be removed.
assert(instList.front() == inst);
// Remove the front instruction.
inst_to_delete = inst;
inst_to_remove = inst;
instList.pop_front();
DPRINTF(FullCPU, "FullCPU: Deleting committed instruction %#x, PC %#x\n",
inst_to_delete, inst_to_delete->readPC());
// delete inst_to_delete;
DPRINTF(FullCPU, "FullCPU: Removing committed instruction %#x, PC %#x\n",
inst_to_remove, inst_to_remove->readPC());
}
template <class Impl>
@ -451,6 +456,33 @@ FullBetaCPU<Impl>::removeInstsNotInROB()
removeBackInst(rob_tail);
}
template <class Impl>
void
FullBetaCPU<Impl>::removeInstsUntil(const InstSeqNum &seq_num)
{
    DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction "
            "list.\n");

    DynInstPtr inst_to_delete;

    // Walk backwards from the youngest instruction, removing every
    // instruction younger than seq_num (i.e. with a larger sequence
    // number).  The emptiness check must be part of the loop condition:
    // the original code called instList.back() in the condition and
    // only asserted !instList.empty() afterwards, so an empty list (or
    // one fully drained by this loop) dereferenced the back of an empty
    // std::list — undefined behavior.  With the check hoisted, an empty
    // list is simply a no-op.
    while (!instList.empty() && instList.back()->seqNum > seq_num) {
        // Obtain the pointer to the instruction.
        inst_to_delete = instList.back();

        DPRINTF(FullCPU, "FullCPU: Removing instruction %i, PC %#x\n",
                inst_to_delete->seqNum, inst_to_delete->readPC());

        // Remove the instruction from the list.
        instList.pop_back();

        // Mark it as squashed so any stage still holding a reference
        // to it knows to ignore it.
        inst_to_delete->setSquashed();
    }
}
template <class Impl>
void
FullBetaCPU<Impl>::removeAllInsts()

View file

@ -115,6 +115,8 @@ class FullBetaCPU : public BaseFullCPU
void init();
void fullCPURegStats();
void activateContext(int thread_num, int delay);
void suspendContext(int thread_num);
void deallocateContext(int thread_num);
@ -205,6 +207,9 @@ class FullBetaCPU : public BaseFullCPU
/** Remove all instructions that are not currently in the ROB. */
void removeInstsNotInROB();
/** Remove all instructions younger than the given sequence number. */
void removeInstsUntil(const InstSeqNum &seq_num);
/** Remove all instructions from the list. */
void removeAllInsts();

View file

@ -9,6 +9,7 @@
#include "base/timebuf.hh"
#include "cpu/beta_cpu/comm.hh"
#include "base/statistics.hh"
//Can IEW even stall? Space should be available/allocated already...maybe
//if there's not enough write ports on the ROB or waiting for CDB
@ -50,7 +51,9 @@ class SimpleIEW
public:
void squash();
void squash(DynInstPtr &inst);
void squashDueToBranch(DynInstPtr &inst);
void squashDueToMem(DynInstPtr &inst);
void block();
@ -59,6 +62,8 @@ class SimpleIEW
public:
SimpleIEW(Params &params);
void regStats();
void setCPU(FullCPU *cpu_ptr);
void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
@ -76,6 +81,10 @@ class SimpleIEW
void iew();
private:
void dispatchInsts();
void executeInsts();
//Interfaces to objects inside and outside of IEW.
/** Time buffer interface. */
TimeBuffer<TimeStruct> *timeBuffer;
@ -159,9 +168,23 @@ class SimpleIEW
*/
unsigned cyclesSquashing;
//Will implement later
//Load queue interface (probably one and the same)
//Store queue interface
Stats::Scalar<> iewIdleCycles;
Stats::Scalar<> iewSquashCycles;
Stats::Scalar<> iewBlockCycles;
Stats::Scalar<> iewUnblockCycles;
// Stats::Scalar<> iewWBInsts;
Stats::Scalar<> iewDispatchedInsts;
Stats::Scalar<> iewDispSquashedInsts;
Stats::Scalar<> iewDispLoadInsts;
Stats::Scalar<> iewDispStoreInsts;
Stats::Scalar<> iewDispNonSpecInsts;
Stats::Scalar<> iewIQFullEvents;
Stats::Scalar<> iewExecutedInsts;
Stats::Scalar<> iewExecLoadInsts;
Stats::Scalar<> iewExecStoreInsts;
Stats::Scalar<> iewExecSquashedInsts;
Stats::Scalar<> memOrderViolationEvents;
Stats::Scalar<> predictedTakenIncorrect;
};
#endif

View file

@ -38,6 +38,79 @@ SimpleIEW<Impl, IQ>::SimpleIEW(Params &params)
instQueue.setIssueToExecuteQueue(&issueToExecQueue);
}
template <class Impl, class IQ>
void
SimpleIEW<Impl, IQ>::regStats()
{
    // Register the IEW stage's statistics with the stats package, and
    // ask the instruction queue to register its own as well.
    //
    // NOTE(review): unlike InstructionQueue::regStats(), none of these
    // stats use .prereq(), so they will always be dumped even when
    // zero — confirm whether that is intentional.
    instQueue.regStats();

    // Cycle-accounting stats: one of these is bumped per tick
    // depending on the stage's status.
    iewIdleCycles
        .name(name() + ".iewIdleCycles")
        .desc("Number of cycles IEW is idle");

    iewSquashCycles
        .name(name() + ".iewSquashCycles")
        .desc("Number of cycles IEW is squashing");

    iewBlockCycles
        .name(name() + ".iewBlockCycles")
        .desc("Number of cycles IEW is blocking");

    iewUnblockCycles
        .name(name() + ".iewUnblockCycles")
        .desc("Number of cycles IEW is unblocking");

    // iewWBInsts;

    // Dispatch-side stats, incremented in dispatchInsts().
    iewDispatchedInsts
        .name(name() + ".iewDispatchedInsts")
        .desc("Number of instructions dispatched to IQ");

    iewDispSquashedInsts
        .name(name() + ".iewDispSquashedInsts")
        .desc("Number of squashed instructions skipped by dispatch");

    iewDispLoadInsts
        .name(name() + ".iewDispLoadInsts")
        .desc("Number of dispatched load instructions");

    iewDispStoreInsts
        .name(name() + ".iewDispStoreInsts")
        .desc("Number of dispatched store instructions");

    iewDispNonSpecInsts
        .name(name() + ".iewDispNonSpecInsts")
        .desc("Number of dispatched non-speculative instructions");

    iewIQFullEvents
        .name(name() + ".iewIQFullEvents")
        .desc("Number of times the IQ has become full, causing a stall");

    // Execute-side stats, incremented in executeInsts().
    iewExecutedInsts
        .name(name() + ".iewExecutedInsts")
        .desc("Number of executed instructions");

    iewExecLoadInsts
        .name(name() + ".iewExecLoadInsts")
        .desc("Number of load instructions executed");

    iewExecStoreInsts
        .name(name() + ".iewExecStoreInsts")
        .desc("Number of store instructions executed");

    iewExecSquashedInsts
        .name(name() + ".iewExecSquashedInsts")
        .desc("Number of squashed instructions skipped in execute");

    // Squash-cause stats.
    memOrderViolationEvents
        .name(name() + ".memOrderViolationEvents")
        .desc("Number of memory order violations");

    predictedTakenIncorrect
        .name(name() + ".predictedTakenIncorrect")
        .desc("Number of branches that were predicted taken incorrectly");
}
template<class Impl, class IQ>
void
SimpleIEW<Impl, IQ>::setCPU(FullCPU *cpu_ptr)
@ -158,7 +231,7 @@ SimpleIEW<Impl, IQ>::squash()
template<class Impl, class IQ>
void
SimpleIEW<Impl, IQ>::squash(DynInstPtr &inst)
SimpleIEW<Impl, IQ>::squashDueToBranch(DynInstPtr &inst)
{
DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n",
inst->PC);
@ -167,14 +240,282 @@ SimpleIEW<Impl, IQ>::squash(DynInstPtr &inst)
_status = Squashing;
// Tell rename to squash through the time buffer.
toRename->iewInfo.squash = true;
toCommit->squash = true;
// Also send PC update information back to prior stages.
toRename->iewInfo.squashedSeqNum = inst->seqNum;
toRename->iewInfo.mispredPC = inst->readPC();
toRename->iewInfo.nextPC = inst->readCalcTarg();
toRename->iewInfo.branchMispredict = true;
toCommit->squashedSeqNum = inst->seqNum;
toCommit->mispredPC = inst->readPC();
toCommit->nextPC = inst->readCalcTarg();
toCommit->branchMispredict = true;
// Prediction was incorrect, so send back inverse.
toRename->iewInfo.branchTaken = !(inst->predTaken());
toCommit->branchTaken = inst->readCalcTarg() !=
(inst->readPC() + sizeof(MachInst));
// toCommit->globalHist = inst->readGlobalHist();
}
template<class Impl, class IQ>
void
SimpleIEW<Impl, IQ>::squashDueToMem(DynInstPtr &inst)
{
    // Initiate a squash caused by a memory order violation detected at
    // the given instruction.  Unlike squashDueToBranch, this only
    // notifies commit (no branch-misprediction info is sent to rename).
    DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n",
            inst->PC);
    // Perhaps leave the squashing up to the ROB stage to tell it when to
    // squash?
    _status = Squashing;

    // Tell commit to squash through the time buffer.
    toCommit->squash = true;

    // Also send PC update information back to prior stages: squash
    // everything younger than this instruction and restart at its
    // calculated target.
    toCommit->squashedSeqNum = inst->seqNum;
    toCommit->nextPC = inst->readCalcTarg();
}
template <class Impl, class IQ>
void
SimpleIEW<Impl, IQ>::dispatchInsts()
{
    ////////////////////////////////////////
    // DISPATCH/ISSUE stage
    ////////////////////////////////////////

    // Takes the instructions delivered by rename this cycle and places
    // each into the appropriate structure: the IQ, the load/store
    // queue, or (for nops and already-executed branches) straight past
    // the IQ via advanceTail().

    //Add instructions to IQ if there are any instructions there

    // Check if there are any instructions coming from rename, and we're
    // not squashing.
    if (fromRename->size > 0) {
        int insts_to_add = fromRename->size;

        // Loop through the instructions, putting them in the instruction
        // queue.
        for (int inst_num = 0; inst_num < insts_to_add; ++inst_num)
        {
            DynInstPtr inst = fromRename->insts[inst_num];

            // Make sure there's a valid instruction there.
            assert(inst);

            DPRINTF(IEW, "IEW: Issue: Adding PC %#x to IQ.\n",
                    inst->readPC());

            // Be sure to mark these instructions as ready so that the
            // commit stage can go ahead and execute them, and mark
            // them as issued so the IQ doesn't reprocess them.
            if (inst->isSquashed()) {
                // Squashed between rename and here: drop it, count it.
                ++iewDispSquashedInsts;
                continue;
            } else if (instQueue.isFull()) {
                DPRINTF(IEW, "IEW: Issue: IQ has become full.\n");
                // Call function to start blocking.
                block();

                // Tell previous stage to stall.
                toRename->iewInfo.stall = true;

                ++iewIQFullEvents;
                // Stop dispatching entirely; remaining insts go to the
                // skid buffer via block().
                break;
            } else if (inst->isLoad()) {
                DPRINTF(IEW, "IEW: Issue: Memory instruction "
                        "encountered, adding to LDSTQ.\n");

                // Reserve a spot in the load store queue for this
                // memory access.  Note: loads also fall through to the
                // instQueue.insert() below so they can be scheduled.
                ldstQueue.insertLoad(inst);

                ++iewDispLoadInsts;
            } else if (inst->isStore()) {
                ldstQueue.insertStore(inst);

                // A bit of a hack.  Set that it can commit so that
                // the commit stage will try committing it, and then
                // once commit realizes it's a store it will send back
                // a signal to this stage to issue and execute that
                // store.  Change to be a bit that says the instruction
                // has extra work to do at commit.
                inst->setCanCommit();

                instQueue.insertNonSpec(inst);

                ++iewDispStoreInsts;
                ++iewDispNonSpecInsts;

                continue;
            } else if (inst->isNonSpeculative()) {
                DPRINTF(IEW, "IEW: Issue: Nonspeculative instruction "
                        "encountered, skipping.\n");

                // Same hack as with stores.
                inst->setCanCommit();

                // Specifically insert it as nonspeculative.
                instQueue.insertNonSpec(inst);

                ++iewDispNonSpecInsts;

                continue;
            } else if (inst->isNop()) {
                DPRINTF(IEW, "IEW: Issue: Nop instruction encountered "
                        ", skipping.\n");

                // Nops never enter the IQ; mark them fully done so
                // commit can retire them directly.
                inst->setIssued();
                inst->setExecuted();
                inst->setCanCommit();

                instQueue.advanceTail(inst);

                continue;
            } else if (inst->isExecuted()) {
                DPRINTF(IEW, "IEW: Issue: Executed branch encountered, "
                        "skipping.\n");

                // Branch resolved earlier in the pipeline; only direct
                // control instructions are expected here.
                assert(inst->isDirectCtrl());

                inst->setIssued();
                inst->setCanCommit();

                instQueue.advanceTail(inst);

                continue;
            }

            // If the instruction queue is not full, then add the
            // instruction.
            instQueue.insert(fromRename->insts[inst_num]);

            ++iewDispatchedInsts;
        }
    }
}
template <class Impl, class IQ>
void
SimpleIEW<Impl, IQ>::executeInsts()
{
    ////////////////////////////////////////
    //EXECUTE/WRITEBACK stage
    ////////////////////////////////////////

    // Executes the instructions issued this cycle, forwards them to
    // commit, and checks each for branch mispredicts and memory order
    // violations (at most one fetch redirect per cycle).

    //Similarly should probably have separate execution for int vs FP.
    // Above comment is handled by the issue queue only issuing a valid
    // mix of int/fp instructions.
    //Actually okay to just have one execution, buuuuuut will need
    //somewhere that defines the execution latency of all instructions.
    // @todo: Move to the FU pool used in the current full cpu.

    int fu_usage = 0;
    bool fetch_redirect = false;

    // Execute/writeback any instructions that are available.
    for (int inst_num = 0;
         fu_usage < executeWidth && /* Haven't exceeded available FU's. */
         inst_num < issueWidth &&
         fromIssue->insts[inst_num];
         ++inst_num) {

        DPRINTF(IEW, "IEW: Execute: Executing instructions from IQ.\n");

        // Get instruction from issue's queue.
        DynInstPtr inst = fromIssue->insts[inst_num];

        DPRINTF(IEW, "IEW: Execute: Processing PC %#x.\n", inst->readPC());

        // Check if the instruction is squashed; if so then skip it
        // and don't count it towards the FU usage.
        if (inst->isSquashed()) {
            DPRINTF(IEW, "IEW: Execute: Instruction was squashed.\n");

            // Consider this instruction executed so that commit can go
            // ahead and retire the instruction.
            inst->setExecuted();

            toCommit->insts[inst_num] = inst;

            ++iewExecSquashedInsts;

            continue;
        }

        inst->setExecuted();

        // If an instruction is executed, then count it towards FU usage.
        ++fu_usage;

        // Execute instruction.
        // Note that if the instruction faults, it will be handled
        // at the commit stage.
        if (inst->isMemRef()) {
            DPRINTF(IEW, "IEW: Execute: Calculating address for memory "
                    "reference.\n");

            // Tell the LDSTQ to execute this instruction (if it is a load).
            if (inst->isLoad()) {
                ldstQueue.executeLoad(inst);

                ++iewExecLoadInsts;
            } else if (inst->isStore()) {
                ldstQueue.executeStore();

                ++iewExecStoreInsts;
            } else {
                panic("IEW: Unexpected memory type!\n");
            }
        } else {
            // Non-memory instruction: execute it directly.
            inst->execute();

            ++iewExecutedInsts;
        }

        // First check the time slot that this instruction will write
        // to.  If there are free write ports at the time, then go ahead
        // and write the instruction to that time.  If there are not,
        // keep looking back to see where's the first time there's a
        // free slot.  What happens if you run out of free spaces?
        // For now naively assume that all instructions take one cycle.
        // Otherwise would have to look into the time buffer based on the
        // latency of the instruction.

        // Add finished instruction to queue to commit.
        toCommit->insts[inst_num] = inst;

        // Check if branch was correct.  This check happens after the
        // instruction is added to the queue because even if the branch
        // is mispredicted, the branch instruction itself is still valid.
        // Only handle this if there hasn't already been something that
        // redirects fetch in this group of instructions.
        if (!fetch_redirect) {
            if (inst->mispredicted()) {
                fetch_redirect = true;

                DPRINTF(IEW, "IEW: Execute: Branch mispredict detected.\n");
                DPRINTF(IEW, "IEW: Execute: Redirecting fetch to PC: %#x.\n",
                        inst->nextPC);

                // If incorrect, then signal the ROB that it must be squashed.
                squashDueToBranch(inst);

                if (inst->predTaken()) {
                    predictedTakenIncorrect++;
                }
            } else if (ldstQueue.violation()) {
                fetch_redirect = true;

                // Get the DynInst that caused the violation.
                DynInstPtr violator = ldstQueue.getMemDepViolator();

                DPRINTF(IEW, "IEW: LDSTQ detected a violation.  Violator PC: "
                        "%#x, inst PC: %#x.  Addr is: %#x.\n",
                        violator->readPC(), inst->readPC(), inst->physEffAddr);

                // Tell the instruction queue that a violation has occured.
                instQueue.violation(inst, violator);

                // Squash.
                squashDueToMem(inst);

                ++memOrderViolationEvents;
            }
        }
    }
}
template<class Impl, class IQ>
@ -198,6 +539,8 @@ SimpleIEW<Impl, IQ>::tick()
// to running.
if (_status == Unblocking) {
unblock();
++iewUnblockCycles;
}
} else if (_status == Squashing) {
@ -216,6 +559,8 @@ SimpleIEW<Impl, IQ>::tick()
instQueue.doSquash();
}
++iewSquashCycles;
// Also should advance its own time buffers if the stage ran.
// Not sure about this...
// issueToExecQueue.advance();
@ -232,7 +577,7 @@ SimpleIEW<Impl, IQ>::tick()
// If there's still instructions coming from rename, continue to
// put them on the skid buffer.
if (fromRename->insts[0]) {
if (fromRename->size == 0) {
block();
}
@ -240,6 +585,8 @@ SimpleIEW<Impl, IQ>::tick()
fromCommit->commitInfo.robSquashing) {
squash();
}
++iewBlockCycles;
}
// @todo: Maybe put these at the beginning, so if it's idle it can
@ -280,209 +627,12 @@ SimpleIEW<Impl, IQ>::iew()
return;
}
////////////////////////////////////////
// DISPATCH/ISSUE stage
////////////////////////////////////////
//Put into its own function?
//Add instructions to IQ if there are any instructions there
// Check if there are any instructions coming from rename, and we're.
// not squashing.
if (fromRename->insts[0] && _status != Squashing) {
// Loop through the instructions, putting them in the instruction
// queue.
for (int inst_num = 0; inst_num < issueReadWidth; ++inst_num)
{
DynInstPtr inst = fromRename->insts[inst_num];
// Make sure there's a valid instruction there.
if (!inst)
break;
DPRINTF(IEW, "IEW: Issue: Adding PC %#x to IQ.\n",
inst->readPC());
// If it's a memory reference, don't put it in the
// instruction queue. These will only be executed at commit.
// Do the same for nonspeculative instructions and nops.
// Be sure to mark these instructions as ready so that the
// commit stage can go ahead and execute them, and mark
// them as issued so the IQ doesn't reprocess them.
if (inst->isSquashed()) {
continue;
} else if (inst->isLoad()) {
DPRINTF(IEW, "IEW: Issue: Memory instruction "
"encountered, adding to LDSTQ.\n");
// Reserve a spot in the load store queue for this
// memory access.
ldstQueue.insertLoad(inst);
} else if (inst->isStore()) {
ldstQueue.insertStore(inst);
// A bit of a hack. Set that it can commit so that
// the commit stage will try committing it, and then
// once commit realizes it's a store it will send back
// a signal to this stage to issue and execute that
// store.
inst->setCanCommit();
instQueue.insertNonSpec(inst);
continue;
} else if (inst->isNonSpeculative()) {
DPRINTF(IEW, "IEW: Issue: Nonspeculative instruction "
"encountered, skipping.\n");
// Same hack as with stores.
inst->setCanCommit();
// Specificall insert it as nonspeculative.
instQueue.insertNonSpec(inst);
continue;
} else if (inst->isNop()) {
DPRINTF(IEW, "IEW: Issue: Nop instruction encountered "
", skipping.\n");
inst->setIssued();
inst->setExecuted();
inst->setCanCommit();
instQueue.advanceTail(inst);
continue;
} else if (instQueue.isFull()) {
DPRINTF(IEW, "IEW: Issue: IQ has become full.\n");
// Call function to start blocking.
block();
// Tell previous stage to stall.
toRename->iewInfo.stall = true;
break;
}
// If the instruction queue is not full, then add the
// instruction.
instQueue.insert(fromRename->insts[inst_num]);
}
}
dispatchInsts();
// Have the instruction queue try to schedule any ready instructions.
instQueue.scheduleReadyInsts();
////////////////////////////////////////
//EXECUTE/WRITEBACK stage
////////////////////////////////////////
//Put into its own function?
//Similarly should probably have separate execution for int vs FP.
// Above comment is handled by the issue queue only issuing a valid
// mix of int/fp instructions.
//Actually okay to just have one execution, buuuuuut will need
//somewhere that defines the execution latency of all instructions.
// @todo: Move to the FU pool used in the current full cpu.
int fu_usage = 0;
bool fetch_redirect = false;
// Execute/writeback any instructions that are available.
for (int inst_num = 0;
fu_usage < executeWidth && /* Haven't exceeded available FU's. */
inst_num < issueWidth && /* Haven't exceeded issue width. */
fromIssue->insts[inst_num]; /* There are available instructions. */
++inst_num) {
DPRINTF(IEW, "IEW: Execute: Executing instructions from IQ.\n");
// Get instruction from issue's queue.
DynInstPtr inst = fromIssue->insts[inst_num];
DPRINTF(IEW, "IEW: Execute: Processing PC %#x.\n", inst->readPC());
// Check if the instruction is squashed; if so then skip it
// and don't count it towards the FU usage.
if (inst->isSquashed()) {
DPRINTF(IEW, "IEW: Execute: Instruction was squashed.\n");
// Consider this instruction executed so that commit can go
// ahead and retire the instruction.
inst->setExecuted();
toCommit->insts[inst_num] = inst;
continue;
}
inst->setExecuted();
// If an instruction is executed, then count it towards FU usage.
++fu_usage;
// Execute instruction.
// Note that if the instruction faults, it will be handled
// at the commit stage.
if (inst->isMemRef()) {
DPRINTF(IEW, "IEW: Execute: Calculating address for memory "
"reference.\n");
// Tell the LDSTQ to execute this instruction (if it is a load).
if (inst->isLoad()) {
ldstQueue.executeLoad(inst);
} else if (inst->isStore()) {
ldstQueue.executeStore();
} else {
panic("IEW: Unexpected memory type!\n");
}
} else {
inst->execute();
}
// First check the time slot that this instruction will write
// to. If there are free write ports at the time, then go ahead
// and write the instruction to that time. If there are not,
// keep looking back to see where's the first time there's a
// free slot. What happens if you run out of free spaces?
// For now naively assume that all instructions take one cycle.
// Otherwise would have to look into the time buffer based on the
// latency of the instruction.
// Add finished instruction to queue to commit.
toCommit->insts[inst_num] = inst;
// Check if branch was correct. This check happens after the
// instruction is added to the queue because even if the branch
// is mispredicted, the branch instruction itself is still valid.
// Only handle this if there hasn't already been something that
// redirects fetch in this group of instructions.
if (!fetch_redirect) {
if (inst->mispredicted()) {
fetch_redirect = true;
DPRINTF(IEW, "IEW: Execute: Branch mispredict detected.\n");
DPRINTF(IEW, "IEW: Execute: Redirecting fetch to PC: %#x.\n",
inst->nextPC);
// If incorrect, then signal the ROB that it must be squashed.
squash(inst);
} else if (ldstQueue.violation()) {
fetch_redirect = true;
DynInstPtr violator = ldstQueue.getMemDepViolator();
DPRINTF(IEW, "IEW: LDSTQ detected a violation. Violator PC: "
"%#x, inst PC: %#x. Addr is: %#x.\n",
violator->readPC(), inst->readPC(), inst->physEffAddr);
instQueue.violation(inst, violator);
squash(inst);
// Otherwise check if there was a memory ordering violation.
// If there was, then signal ROB that it must be squashed. Also
// signal IQ that there was a violation.
}
}
}
executeInsts();
// Loop through the head of the time buffer and wake any dependents.
// These instructions are about to write back. In the simple model
@ -491,7 +641,7 @@ SimpleIEW<Impl, IQ>::iew()
// Also mark scoreboard that this instruction is finally complete.
// Either have IEW have direct access to rename map, or have this as
// part of backwards communication.
for (int inst_num = 0; inst_num < executeWidth &&
for (int inst_num = 0; inst_num < issueWidth &&
toCommit->insts[inst_num]; inst_num++)
{
DynInstPtr inst = toCommit->insts[inst_num];

View file

@ -5,3 +5,6 @@
// Force instantiation of InstructionQueue.
template InstructionQueue<AlphaSimpleImpl>;
unsigned
InstructionQueue<AlphaSimpleImpl>::DependencyEntry::mem_alloc_counter = 0;

View file

@ -7,14 +7,10 @@
#include <stdint.h>
#include <vector>
#include "base/statistics.hh"
#include "base/timebuf.hh"
#include "cpu/inst_seq.hh"
//Perhaps have a better separation between the data structure underlying
//and the actual algorithm.
//somewhat nasty to try to have a nice ordering.
// Consider moving to STL list or slist for the LL stuff.
/**
* A standard instruction queue class. It holds instructions in an
* array, holds the ordering of the instructions within a linked list,
@ -74,6 +70,8 @@ class InstructionQueue
InstructionQueue(Params &params);
void regStats();
void setCPU(FullCPU *cpu);
void setIssueToExecuteQueue(TimeBuffer<IssueStruct> *i2eQueue);
@ -98,6 +96,7 @@ class InstructionQueue
void violation(DynInstPtr &store, DynInstPtr &faulting_load);
// Change this to take in the sequence number
void squash();
void doSquash();
@ -159,7 +158,7 @@ class InstructionQueue
ReadyInstQueue readyBranchInsts;
/** List of ready memory instructions. */
ReadyInstQueue readyMemInsts;
// ReadyInstQueue readyMemInsts;
/** List of ready miscellaneous instructions. */
ReadyInstQueue readyMiscInsts;
@ -228,9 +227,6 @@ class InstructionQueue
/** The sequence number of the squashed instruction. */
InstSeqNum squashedSeqNum;
/** Iterator that points to the oldest instruction in the IQ. */
// ListIt head;
/** Iterator that points to the youngest instruction in the IQ. */
ListIt tail;
@ -261,6 +257,9 @@ class InstructionQueue
void insert(DynInstPtr &new_inst);
void remove(DynInstPtr &inst_to_remove);
// Debug variable, remove when done testing.
static unsigned mem_alloc_counter;
};
/** Array of linked lists. Each linked list is a list of all the
@ -285,6 +284,25 @@ class InstructionQueue
void dumpDependGraph();
void addIfReady(DynInstPtr &inst);
Stats::Scalar<> iqInstsAdded;
Stats::Scalar<> iqNonSpecInstsAdded;
// Stats::Scalar<> iqIntInstsAdded;
Stats::Scalar<> iqIntInstsIssued;
// Stats::Scalar<> iqFloatInstsAdded;
Stats::Scalar<> iqFloatInstsIssued;
// Stats::Scalar<> iqBranchInstsAdded;
Stats::Scalar<> iqBranchInstsIssued;
// Stats::Scalar<> iqMemInstsAdded;
Stats::Scalar<> iqMemInstsIssued;
// Stats::Scalar<> iqMiscInstsAdded;
Stats::Scalar<> iqMiscInstsIssued;
Stats::Scalar<> iqSquashedInstsIssued;
Stats::Scalar<> iqLoopSquashStalls;
Stats::Scalar<> iqSquashedInstsExamined;
Stats::Scalar<> iqSquashedOperandsExamined;
Stats::Scalar<> iqSquashedNonSpecRemoved;
};
#endif //__INST_QUEUE_HH__

View file

@ -24,15 +24,13 @@ InstructionQueue<Impl>::InstructionQueue(Params &params)
numEntries(params.numIQEntries),
intWidth(params.executeIntWidth),
floatWidth(params.executeFloatWidth),
branchWidth(params.executeBranchWidth),
memoryWidth(params.executeMemoryWidth),
totalWidth(params.issueWidth),
numPhysIntRegs(params.numPhysIntRegs),
numPhysFloatRegs(params.numPhysFloatRegs),
commitToIEWDelay(params.commitToIEWDelay)
{
// HACK: HARDCODED NUMBER. REMOVE LATER AND ADD TO PARAMETER.
branchWidth = 1;
memoryWidth = 1;
DPRINTF(IQ, "IQ: Int width is %i.\n", params.executeIntWidth);
// Initialize the number of free IQ entries.
@ -66,6 +64,87 @@ InstructionQueue<Impl>::InstructionQueue(Params &params)
}
template <class Impl>
void
InstructionQueue<Impl>::regStats()
{
    // Register the instruction queue's statistics with the stats
    // package.  Each stat uses .prereq(itself) so it is only dumped
    // when it has a nonzero value.
    iqInstsAdded
        .name(name() + ".iqInstsAdded")
        .desc("Number of instructions added to the IQ (excludes non-spec)")
        .prereq(iqInstsAdded);

    iqNonSpecInstsAdded
        .name(name() + ".iqNonSpecInstsAdded")
        .desc("Number of non-speculative instructions added to the IQ")
        .prereq(iqNonSpecInstsAdded);

    // Per-class issue counts; the corresponding *InstsAdded stats are
    // currently commented out.
    // iqIntInstsAdded;

    iqIntInstsIssued
        .name(name() + ".iqIntInstsIssued")
        .desc("Number of integer instructions issued")
        .prereq(iqIntInstsIssued);

    // iqFloatInstsAdded;

    iqFloatInstsIssued
        .name(name() + ".iqFloatInstsIssued")
        .desc("Number of float instructions issued")
        .prereq(iqFloatInstsIssued);

    // iqBranchInstsAdded;

    iqBranchInstsIssued
        .name(name() + ".iqBranchInstsIssued")
        .desc("Number of branch instructions issued")
        .prereq(iqBranchInstsIssued);

    // iqMemInstsAdded;

    iqMemInstsIssued
        .name(name() + ".iqMemInstsIssued")
        .desc("Number of memory instructions issued")
        .prereq(iqMemInstsIssued);

    // iqMiscInstsAdded;

    iqMiscInstsIssued
        .name(name() + ".iqMiscInstsIssued")
        .desc("Number of miscellaneous instructions issued")
        .prereq(iqMiscInstsIssued);

    iqSquashedInstsIssued
        .name(name() + ".iqSquashedInstsIssued")
        .desc("Number of squashed instructions issued")
        .prereq(iqSquashedInstsIssued);

    // Squash-handling profiling stats.
    iqLoopSquashStalls
        .name(name() + ".iqLoopSquashStalls")
        .desc("Number of times issue loop had to restart due to squashed "
              "inst; mainly for profiling")
        .prereq(iqLoopSquashStalls);

    iqSquashedInstsExamined
        .name(name() + ".iqSquashedInstsExamined")
        .desc("Number of squashed instructions iterated over during squash;"
              " mainly for profiling")
        .prereq(iqSquashedInstsExamined);

    iqSquashedOperandsExamined
        .name(name() + ".iqSquashedOperandsExamined")
        .desc("Number of squashed operands that are examined and possibly "
              "removed from graph")
        .prereq(iqSquashedOperandsExamined);

    iqSquashedNonSpecRemoved
        .name(name() + ".iqSquashedNonSpecRemoved")
        .desc("Number of squashed non-spec instructions that were removed")
        .prereq(iqSquashedNonSpecRemoved);

    // Tell mem dependence unit to reg stats as well.
    memDepUnit.regStats();
}
template <class Impl>
void
InstructionQueue<Impl>::setCPU(FullCPU *cpu_ptr)
@ -161,10 +240,14 @@ InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
// unit.
if (new_inst->isMemRef()) {
memDepUnit.insert(new_inst);
// Uh..forgot to look it up and put it on the proper dependency list
// if the instruction should not go yet.
} else {
// If the instruction is ready then add it to the ready list.
addIfReady(new_inst);
}
// If the instruction is ready then add it to the ready list.
addIfReady(new_inst);
++iqInstsAdded;
assert(freeEntries == (numEntries - countInsts()));
}
@ -219,13 +302,16 @@ InstructionQueue<Impl>::insertNonSpec(DynInstPtr &inst)
// If it's a memory instruction, add it to the memory dependency
// unit.
if (inst->isMemRef()) {
memDepUnit.insert(inst);
memDepUnit.insertNonSpec(inst);
}
++iqNonSpecInstsAdded;
}
// Slightly hack function to advance the tail iterator in the case that
// the IEW stage issues an instruction that is not added to the IQ. This
// is needed in case a long chain of such instructions occurs.
// I don't think this is used anymore.
template <class Impl>
void
InstructionQueue<Impl>::advanceTail(DynInstPtr &inst)
@ -288,7 +374,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
bool insts_available = !readyBranchInsts.empty() ||
!readyIntInsts.empty() ||
!readyFloatInsts.empty() ||
!readyMemInsts.empty() ||
!memDepUnit.empty() ||
!readyMiscInsts.empty() ||
!squashedInsts.empty();
@ -327,6 +413,9 @@ InstructionQueue<Impl>::scheduleReadyInsts()
if (int_head_inst->isSquashed()) {
readyIntInsts.pop();
++iqLoopSquashStalls;
continue;
}
@ -344,6 +433,9 @@ InstructionQueue<Impl>::scheduleReadyInsts()
if (float_head_inst->isSquashed()) {
readyFloatInsts.pop();
++iqLoopSquashStalls;
continue;
} else if (float_head_inst->seqNum < oldest_inst) {
oldest_inst = float_head_inst->seqNum;
@ -361,6 +453,9 @@ InstructionQueue<Impl>::scheduleReadyInsts()
if (branch_head_inst->isSquashed()) {
readyBranchInsts.pop();
++iqLoopSquashStalls;
continue;
} else if (branch_head_inst->seqNum < oldest_inst) {
oldest_inst = branch_head_inst->seqNum;
@ -370,15 +465,18 @@ InstructionQueue<Impl>::scheduleReadyInsts()
}
if (!readyMemInsts.empty() &&
if (!memDepUnit.empty() &&
memory_issued < memoryWidth) {
insts_available = true;
mem_head_inst = readyMemInsts.top();
mem_head_inst = memDepUnit.top();
if (mem_head_inst->isSquashed()) {
readyMemInsts.pop();
memDepUnit.pop();
++iqLoopSquashStalls;
continue;
} else if (mem_head_inst->seqNum < oldest_inst) {
oldest_inst = mem_head_inst->seqNum;
@ -395,6 +493,9 @@ InstructionQueue<Impl>::scheduleReadyInsts()
if (misc_head_inst->isSquashed()) {
readyMiscInsts.pop();
++iqLoopSquashStalls;
continue;
} else if (misc_head_inst->seqNum < oldest_inst) {
oldest_inst = misc_head_inst->seqNum;
@ -450,9 +551,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
case Memory:
issuing_inst = mem_head_inst;
memDepUnit.issue(mem_head_inst);
readyMemInsts.pop();
memDepUnit.pop();
++memory_issued;
DPRINTF(IQ, "IQ: Issuing memory instruction PC %#x.\n",
issuing_inst->readPC());
@ -461,6 +560,9 @@ InstructionQueue<Impl>::scheduleReadyInsts()
case Misc:
issuing_inst = misc_head_inst;
readyMiscInsts.pop();
++iqMiscInstsIssued;
DPRINTF(IQ, "IQ: Issuing a miscellaneous instruction PC %#x.\n",
issuing_inst->readPC());
break;
@ -476,6 +578,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
if (list_with_oldest != None) {
i2e_info->insts[total_issued] = issuing_inst;
i2e_info->size++;
issuing_inst->setIssued();
@ -485,12 +588,21 @@ InstructionQueue<Impl>::scheduleReadyInsts()
assert(freeEntries == (numEntries - countInsts()));
}
iqIntInstsIssued += int_issued;
iqFloatInstsIssued += float_issued;
iqBranchInstsIssued += branch_issued;
iqMemInstsIssued += memory_issued;
iqSquashedInstsIssued += squashed_issued;
}
template <class Impl>
void
InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst)
{
DPRINTF(IQ, "IQ: Marking nonspeculative instruction with sequence "
"number %i as ready to execute.\n", inst);
non_spec_it_t inst_it = nonSpecInsts.find(inst);
assert(inst_it != nonSpecInsts.end());
@ -499,7 +611,11 @@ InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst)
(*inst_it).second->setCanIssue();
// Now schedule the instruction.
addIfReady((*inst_it).second);
if (!(*inst_it).second->isMemRef()) {
addIfReady((*inst_it).second);
} else {
memDepUnit.nonSpecInstReady((*inst_it).second);
}
nonSpecInsts.erase(inst_it);
}
@ -552,6 +668,7 @@ InstructionQueue<Impl>::doSquash()
// hasn't already been squashed in the IQ.
if (!squashed_inst->isIssued() &&
!squashed_inst->isSquashedInIQ()) {
// Remove the instruction from the dependency list.
// Hack for now: These below don't add themselves to the
// dependency list, so don't try to remove them.
@ -576,7 +693,15 @@ InstructionQueue<Impl>::doSquash()
src_reg < numPhysRegs) {
dependGraph[src_reg].remove(squashed_inst);
}
++iqSquashedOperandsExamined;
}
// Might want to remove producers as well.
} else {
nonSpecInsts.erase(squashed_inst->seqNum);
++iqSquashedNonSpecRemoved;
}
// Might want to also clear out the head of the dependency graph.
@ -590,11 +715,8 @@ InstructionQueue<Impl>::doSquash()
squashed_inst->readPC());
}
if (squashed_inst->isNonSpeculative() || squashed_inst->isStore()) {
nonSpecInsts.erase(squashed_inst->seqNum);
}
--squashIt;
++iqSquashedInstsExamined;
}
}
@ -665,6 +787,8 @@ InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
dependGraph[dest_reg].next = curr->next;
DependencyEntry::mem_alloc_counter--;
delete curr;
}
@ -749,13 +873,9 @@ InstructionQueue<Impl>::createDependency(DynInstPtr &new_inst)
}
dependGraph[dest_reg].inst = new_inst;
#if 0
if (dependGraph[dest_reg].next) {
panic("Dependency chain of dest reg %i is not empty.\n",
dest_reg);
}
#endif
assert(!dependGraph[dest_reg].next);
// Mark the scoreboard to say it's not yet ready.
regScoreboard[dest_reg] = false;
}
@ -776,6 +896,8 @@ InstructionQueue<Impl>::DependencyEntry::insert(DynInstPtr &new_inst)
// Then actually add it to the chain.
this->next = new_entry;
++mem_alloc_counter;
}
template <class Impl>
@ -805,6 +927,8 @@ InstructionQueue<Impl>::DependencyEntry::remove(DynInstPtr &inst_to_remove)
// Now remove this instruction from the list.
prev->next = curr->next;
--mem_alloc_counter;
delete curr;
}
@ -855,12 +979,26 @@ InstructionQueue<Impl>::addIfReady(DynInstPtr &inst)
DPRINTF(IQ, "IQ: Checking if memory instruction can issue.\n");
// Message to the mem dependence unit that this instruction has
// its registers ready.
memDepUnit.regsReady(inst);
#if 0
if (memDepUnit.readyToIssue(inst)) {
DPRINTF(IQ, "IQ: Memory instruction is ready to issue, "
"putting it onto the ready list, PC %#x.\n",
inst->readPC());
readyMemInsts.push(inst);
} else {
// Make dependent on the store.
// Will need some way to get the store instruction it should
// be dependent upon; then when the store issues it can
// put the instruction on the ready list.
// Yet another tree?
assert(0 && "Instruction has no way to actually issue");
}
#endif
} else if (inst->isInteger()) {
@ -923,7 +1061,7 @@ InstructionQueue<Impl>::dumpLists()
cprintf("Ready branch list size: %i\n", readyBranchInsts.size());
cprintf("Ready memory list size: %i\n", readyMemInsts.size());
// cprintf("Ready memory list size: %i\n", readyMemInsts.size());
cprintf("Ready misc list size: %i\n", readyMiscInsts.size());

View file

@ -6,6 +6,7 @@
#include <map>
#include "cpu/inst_seq.hh"
#include "base/statistics.hh"
/**
* Memory dependency unit class. This holds the memory dependence predictor.
@ -24,17 +25,18 @@ class MemDepUnit {
typedef typename Impl::Params Params;
typedef typename Impl::DynInstPtr DynInstPtr;
public:
typedef typename std::set<InstSeqNum>::iterator sn_it_t;
typedef typename std::map<InstSeqNum, vector<InstSeqNum> >::iterator
dep_it_t;
public:
MemDepUnit(Params &params);
void regStats();
void insert(DynInstPtr &inst);
bool readyToIssue(DynInstPtr &inst);
void insertNonSpec(DynInstPtr &inst);
void regsReady(DynInstPtr &inst);
void nonSpecInstReady(DynInstPtr &inst);
void issue(DynInstPtr &inst);
@ -44,19 +46,83 @@ class MemDepUnit {
void violation(DynInstPtr &store_inst, DynInstPtr &violating_load);
// Will want to make this operation relatively fast. Right now it
// kind of sucks.
DynInstPtr &top();
void pop();
inline bool empty()
{ return readyInsts.empty(); }
private:
typedef typename std::set<InstSeqNum>::iterator sn_it_t;
typedef typename std::map<InstSeqNum, DynInstPtr>::iterator dyn_it_t;
// Forward declarations so that the following two typedefs work.
class Dependency;
class ltDependency;
typedef typename std::set<Dependency, ltDependency>::iterator dep_it_t;
typedef typename std::map<InstSeqNum, vector<dep_it_t> >::iterator
sd_it_t;
struct Dependency {
Dependency(const InstSeqNum &_seqNum)
: seqNum(_seqNum), regsReady(0), memDepReady(0)
{ }
Dependency(const InstSeqNum &_seqNum, bool _regsReady,
bool _memDepReady)
: seqNum(_seqNum), regsReady(_regsReady),
memDepReady(_memDepReady)
{ }
InstSeqNum seqNum;
mutable bool regsReady;
mutable bool memDepReady;
mutable sd_it_t storeDep;
};
struct ltDependency {
bool operator() (const Dependency &lhs, const Dependency &rhs)
{
return lhs.seqNum < rhs.seqNum;
}
};
private:
inline void moveToReady(dep_it_t &woken_inst);
private:
/** List of instructions that have passed through rename, yet are still
* waiting on a memory dependence to resolve before they can issue.
* waiting on either a memory dependence to resolve or source registers to
* become available before they can issue.
*/
std::set<InstSeqNum> renamedInsts;
std::set<Dependency, ltDependency> waitingInsts;
/** List of instructions that have all their predicted memory dependences
* resolved. They are ready in terms of being free of memory
* dependences; however they may still have to wait on source registers.
* resolved and their source registers ready.
*/
std::set<InstSeqNum> readyInsts;
std::map<InstSeqNum, vector<InstSeqNum> > dependencies;
// Change this to hold a vector of iterators, which will point to the
// entry of the waiting instructions.
/** List of stores' sequence numbers, each of which has a vector of
* iterators. The iterators point to the appropriate node within
     * waitingInsts that has the dependent instruction.
*/
std::map<InstSeqNum, vector<dep_it_t> > storeDependents;
// For now will implement this as a map...hash table might not be too
// bad, or could move to something that mimics the current dependency
// graph.
std::map<InstSeqNum, DynInstPtr> memInsts;
// Iterator pointer to the top instruction which has is ready.
// Is set by the top() call.
dyn_it_t topInst;
/** The memory dependence predictor. It is accessed upon new
* instructions being added to the IQ, and responds by telling
@ -65,6 +131,10 @@ class MemDepUnit {
*/
MemDepPred depPred;
Stats::Scalar<> insertedLoads;
Stats::Scalar<> insertedStores;
Stats::Scalar<> conflictingLoads;
Stats::Scalar<> conflictingStores;
};
#endif

View file

@ -3,60 +3,236 @@
#include "cpu/beta_cpu/mem_dep_unit.hh"
// Hack: dependence predictor sizes are hardcoded.
template <class MemDepPred, class Impl>
MemDepUnit<MemDepPred, Impl>::MemDepUnit(Params &params)
: depPred(4028, 128)
: depPred(params.SSITSize, params.LFSTSize)
{
DPRINTF(MemDepUnit, "MemDepUnit: Creating MemDepUnit object.\n");
}
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::regStats()
{
    // Register the memory dependence unit's stats, nested under the
    // owner's name in a ".memDep" group.
    // NOTE(review): unlike the other regStats() routines in this file,
    // none of these stats set a .prereq(), so they are printed even when
    // zero -- confirm whether that is intentional.
    insertedLoads
        .name(name() + ".memDep.insertedLoads")
        .desc("Number of loads inserted to the mem dependence unit.");

    insertedStores
        .name(name() + ".memDep.insertedStores")
        .desc("Number of stores inserted to the mem dependence unit.");

    conflictingLoads
        .name(name() + ".memDep.conflictingLoads")
        .desc("Number of conflicting loads.");

    conflictingStores
        .name(name() + ".memDep.conflictingStores")
        .desc("Number of conflicting stores.");
}
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
{
InstSeqNum inst_seq_num = inst->seqNum;
Dependency unresolved_dependencies(inst_seq_num);
InstSeqNum producing_store = depPred.checkInst(inst->readPC());
if (producing_store == 0 ||
dependencies.find(producing_store) == dependencies.end()) {
readyInsts.insert(inst_seq_num);
storeDependents.find(producing_store) == storeDependents.end()) {
DPRINTF(MemDepUnit, "MemDepUnit: No dependency for inst PC "
"%#x.\n", inst->readPC());
unresolved_dependencies.storeDep = storeDependents.end();
if (inst->readyToIssue()) {
readyInsts.insert(inst_seq_num);
} else {
unresolved_dependencies.memDepReady = true;
waitingInsts.insert(unresolved_dependencies);
}
} else {
DPRINTF(MemDepUnit, "MemDepUnit: Adding to dependency list; "
"inst PC %#x is dependent on seq num %i.\n",
inst->readPC(), producing_store);
if (inst->readyToIssue()) {
unresolved_dependencies.regsReady = true;
}
// Find the store that this instruction is dependent on.
sd_it_t store_loc = storeDependents.find(producing_store);
assert(store_loc != storeDependents.end());
// Record the location of the store that this instruction is
// dependent on.
unresolved_dependencies.storeDep = store_loc;
// If it's not already ready, then add it to the renamed
// list and the dependencies.
renamedInsts.insert(inst_seq_num);
dep_it_t inst_loc =
(waitingInsts.insert(unresolved_dependencies)).first;
dependencies[producing_store].push_back(inst_seq_num);
// Add this instruction to the list of dependents.
(*store_loc).second.push_back(inst_loc);
assert(!(*store_loc).second.empty());
if (inst->isLoad()) {
++conflictingLoads;
} else {
++conflictingStores;
}
}
if (inst->isStore()) {
DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n",
inst->readPC());
depPred.insertStore(inst->readPC(), inst_seq_num);
// Make sure this store isn't already in this list.
assert(dependencies.find(inst_seq_num) == dependencies.end());
assert(storeDependents.find(inst_seq_num) == storeDependents.end());
// Put a dependency entry in at the store's sequence number.
// Uh, not sure how this works...I want to create an entry but
// I don't have anything to put into the value yet.
dependencies[inst_seq_num];
} else if (!inst->isLoad()) {
storeDependents[inst_seq_num];
assert(storeDependents.size() != 0);
++insertedStores;
} else if (inst->isLoad()) {
++insertedLoads;
} else {
panic("MemDepUnit: Unknown type! (most likely a barrier).");
}
memInsts[inst_seq_num] = inst;
}
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::insertNonSpec(DynInstPtr &inst)
{
    const InstSeqNum seq_num = inst->seqNum;

    // A non-speculative instruction sits on the waiting list until commit
    // signals it may issue; it carries no store dependence of its own.
    Dependency waiting_entry(seq_num);
    waiting_entry.storeDep = storeDependents.end();
    waitingInsts.insert(waiting_entry);

    // Bookkeeping shared with insert(): stores are registered with the
    // dependence predictor and get an (initially empty) dependent list;
    // loads just bump a counter.  Anything else is unexpected here.
    if (inst->isStore()) {
        DPRINTF(MemDepUnit, "MemDepUnit: Inserting store PC %#x.\n",
                inst->readPC());

        depPred.insertStore(inst->readPC(), seq_num);

        // A store with this sequence number must not be tracked already.
        assert(storeDependents.find(seq_num) == storeDependents.end());

        // operator[] default-constructs the store's vector of dependents.
        storeDependents[seq_num];

        assert(storeDependents.size() != 0);

        ++insertedStores;
    } else if (inst->isLoad()) {
        ++insertedLoads;
    } else {
        panic("MemDepUnit: Unknown type! (most likely a barrier).");
    }

    memInsts[seq_num] = inst;
}
template <class MemDepPred, class Impl>
typename Impl::DynInstPtr &
MemDepUnit<MemDepPred, Impl>::top()
{
    // The smallest ready sequence number identifies the oldest ready
    // instruction.  Cache its map iterator so pop() removes the same one.
    const InstSeqNum &oldest_ready = *readyInsts.begin();
    topInst = memInsts.find(oldest_ready);

    DPRINTF(MemDepUnit, "MemDepUnit: Top instruction is PC %#x.\n",
            (*topInst).second->readPC());

    return (*topInst).second;
}
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::pop()
{
    DPRINTF(MemDepUnit, "MemDepUnit: Removing instruction PC %#x.\n",
            (*topInst).second->readPC());

    DynInstPtr &popped_inst = (*topInst).second;

    // First wake anything depending on this instruction, then tell the
    // dependence predictor it issued, and finally drop it from tracking.
    wakeDependents(popped_inst);
    issue(popped_inst);

    memInsts.erase(topInst);

    topInst = memInsts.end();
}
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::regsReady(DynInstPtr &inst)
{
    DPRINTF(MemDepUnit, "MemDepUnit: Marking registers as ready for "
            "instruction PC %#x.\n",
            inst->readPC());

    // Find this instruction's entry on the waiting list; it must be there.
    Dependency search_key(inst->seqNum);
    dep_it_t entry_it = waitingInsts.find(search_key);

    assert(entry_it != waitingInsts.end());

    if (!(*entry_it).memDepReady) {
        // Still blocked on a store; just record that registers are ready.
        DPRINTF(MemDepUnit, "MemDepUnit: Instruction still waiting on "
                "memory dependency.\n");

        (*entry_it).regsReady = true;
    } else {
        // Memory side already resolved, so the instruction can now issue.
        DPRINTF(MemDepUnit, "MemDepUnit: Instruction has its memory "
                "dependencies resolved, adding it to the ready list.\n");

        moveToReady(entry_it);
    }
}
template <class MemDepPred, class Impl>
bool
MemDepUnit<MemDepPred, Impl>::readyToIssue(DynInstPtr &inst)
void
MemDepUnit<MemDepPred, Impl>::nonSpecInstReady(DynInstPtr &inst)
{
DPRINTF(MemDepUnit, "MemDepUnit: Marking non speculative "
"instruction PC %#x as ready.\n",
inst->readPC());
InstSeqNum inst_seq_num = inst->seqNum;
if (readyInsts.find(inst_seq_num) == readyInsts.end()) {
return false;
} else {
return true;
}
Dependency inst_to_find(inst_seq_num);
dep_it_t waiting_inst = waitingInsts.find(inst_to_find);
assert(waiting_inst != waitingInsts.end());
moveToReady(waiting_inst);
}
template <class MemDepPred, class Impl>
@ -65,46 +241,63 @@ MemDepUnit<MemDepPred, Impl>::issue(DynInstPtr &inst)
{
assert(readyInsts.find(inst->seqNum) != readyInsts.end());
DPRINTF(MemDepUnit, "MemDepUnit: Issuing instruction PC %#x.\n",
inst->readPC());
// Remove the instruction from the ready list.
readyInsts.erase(inst->seqNum);
depPred.issued(inst->readPC(), inst->seqNum, inst->isStore());
}
template <class MemDepPred, class Impl>
void
MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst)
{
// Wake any dependencies.
dep_it_t dep_it = dependencies.find(inst);
// If there's no entry, then return. Really there should only be
// no entry if the instruction is a load.
if (dep_it == dependencies.end()) {
// Only stores have dependents.
if (!inst->isStore()) {
return;
}
assert(inst->isStore());
// Wake any dependencies.
sd_it_t sd_it = storeDependents.find(inst->seqNum);
for(int i = 0; i < (*dep_it).second.size(); ++i ) {
InstSeqNum woken_inst = (*dep_it).second[i];
// If there's no entry, then return. Really there should only be
// no entry if the instruction is a load.
if (sd_it == storeDependents.end()) {
DPRINTF(MemDepUnit, "MemDepUnit: Instruction PC %#x, sequence "
"number %i has no dependents.\n",
inst->readPC(), inst->seqNum);
return;
}
for (int i = 0; i < (*sd_it).second.size(); ++i ) {
dep_it_t woken_inst = (*sd_it).second[i];
DPRINTF(MemDepUnit, "MemDepUnit: Waking up a dependent inst, "
"sequence number %i.\n",
(*woken_inst).seqNum);
#if 0
// Should we have reached instructions that are actually squashed,
// there will be no more useful instructions in this dependency
// list. Break out early.
if (renamedInsts.find(woken_inst) == renamedInsts.end()) {
if (waitingInsts.find(woken_inst) == waitingInsts.end()) {
DPRINTF(MemDepUnit, "MemDepUnit: Dependents on inst PC %#x "
"are squashed, starting at SN %i. Breaking early.\n",
inst->readPC(), woken_inst);
break;
}
#endif
// Remove it from the renamed instructions.
renamedInsts.erase(woken_inst);
// Add it to the ready list.
readyInsts.insert(woken_inst);
if ((*woken_inst).regsReady) {
moveToReady(woken_inst);
} else {
(*woken_inst).memDepReady = true;
}
}
dependencies.erase(dep_it);
storeDependents.erase(sd_it);
}
template <class MemDepPred, class Impl>
@ -112,17 +305,30 @@ void
MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num)
{
if (!renamedInsts.empty()) {
sn_it_t renamed_it = renamedInsts.end();
if (!waitingInsts.empty()) {
dep_it_t waiting_it = waitingInsts.end();
--renamed_it;
--waiting_it;
// Remove entries from the renamed list as long as we haven't reached
// the end and the entries continue to be younger than the squashed.
while (!renamedInsts.empty() &&
(*renamed_it) > squashed_num)
while (!waitingInsts.empty() &&
(*waiting_it).seqNum > squashed_num)
{
renamedInsts.erase(renamed_it--);
if (!(*waiting_it).memDepReady &&
(*waiting_it).storeDep != storeDependents.end()) {
sd_it_t sd_it = (*waiting_it).storeDep;
// Make sure the iterator that the store has pointing
// back is actually to this instruction.
assert((*sd_it).second.back() == waiting_it);
// Now remove this from the store's list of dependent
// instructions.
(*sd_it).second.pop_back();
}
waitingInsts.erase(waiting_it--);
}
}
@ -139,16 +345,19 @@ MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num)
}
}
if (!dependencies.empty()) {
dep_it_t dep_it = dependencies.end();
if (!storeDependents.empty()) {
sd_it_t dep_it = storeDependents.end();
--dep_it;
// Same for the dependencies list.
while (!dependencies.empty() &&
while (!storeDependents.empty() &&
(*dep_it).first > squashed_num)
{
dependencies.erase(dep_it--);
// This store's list of dependent instructions should be empty.
assert((*dep_it).second.empty());
storeDependents.erase(dep_it--);
}
}
@ -161,6 +370,23 @@ void
MemDepUnit<MemDepPred, Impl>::violation(DynInstPtr &store_inst,
DynInstPtr &violating_load)
{
DPRINTF(MemDepUnit, "MemDepUnit: Passing violating PCs to store sets,"
" load: %#x, store: %#x\n", violating_load->readPC(),
store_inst->readPC());
// Tell the memory dependence unit of the violation.
depPred.violation(violating_load->readPC(), store_inst->readPC());
}
template <class MemDepPred, class Impl>
inline void
MemDepUnit<MemDepPred, Impl>::moveToReady(dep_it_t &woken_inst)
{
    DPRINTF(MemDepUnit, "MemDepUnit: Adding instruction sequence number %i "
            "to the ready list.\n", (*woken_inst).seqNum);

    // Transfer the instruction from the waiting list to the ready list.
    readyInsts.insert((*woken_inst).seqNum);
    waitingInsts.erase(woken_inst);
}

42
cpu/beta_cpu/ras.cc Normal file
View file

@ -0,0 +1,42 @@
#include "cpu/beta_cpu/ras.hh"
/**
 * Constructs a return address stack with the given number of entries.
 * The stack is a circular buffer; all entries start out zeroed.
 */
ReturnAddrStack::ReturnAddrStack(unsigned _numEntries)
    : numEntries(_numEntries), usedEntries(0),
      tos(0)
{
    // Value-initialize so every entry starts at zero.  The previous
    // `new Addr[numEntries](0)` is not valid ISO C++: an array
    // new-expression cannot take a parenthesized initializer argument.
    addrStack = new Addr[numEntries]();
}
/** Pushes a return address onto the top of the stack. */
void
ReturnAddrStack::push(const Addr &return_addr)
{
    // Advance the top-of-stack pointer first, then write the new address.
    incrTos();

    addrStack[tos] = return_addr;

    // Saturate the occupancy count at the stack's capacity.
    if (usedEntries < numEntries)
        ++usedEntries;
}
/** Pops the top entry off the stack. */
void
ReturnAddrStack::pop()
{
    // usedEntries is only a best-effort occupancy count (the original
    // author noted it cannot be tracked exactly), so guard the decrement
    // instead of asserting it is positive.
    if (usedEntries > 0)
        --usedEntries;

    decrTos();
}
/**
 * Restores the stack to a checkpointed state: rolls the top-of-stack
 * pointer back to top_entry_idx and rewrites that slot with the saved
 * target.  Used to recover the RAS after a branch mispredict.
 */
void
ReturnAddrStack::restore(unsigned top_entry_idx,
                         const Addr &restored_target)
{
    tos = top_entry_idx;

    addrStack[top_entry_idx] = restored_target;
}

40
cpu/beta_cpu/ras.hh Normal file
View file

@ -0,0 +1,40 @@
#ifndef __RAS_HH__
#define __RAS_HH__
// For Addr type.
#include "arch/alpha/isa_traits.hh"
class ReturnAddrStack
{
public:
ReturnAddrStack(unsigned numEntries);
Addr top()
{ return addrStack[tos]; }
unsigned topIdx()
{ return tos; }
void push(const Addr &return_addr);
void pop();
void restore(unsigned top_entry_idx, const Addr &restored_target);
private:
inline void incrTos()
{ tos = (tos + 1) % numEntries; }
inline void decrTos()
{ tos = (tos == 0 ? numEntries - 1 : tos - 1); }
Addr *addrStack;
unsigned numEntries;
unsigned usedEntries;
unsigned tos;
};
#endif // __RAS_HH__

View file

@ -54,7 +54,7 @@ class PhysRegFile
// Remove the base Float reg dependency.
reg_idx = reg_idx - numPhysicalIntRegs;
assert(reg_idx < numPhysicalFloatRegs);
assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
DPRINTF(IEW, "RegFile: Access to float register %i as single, has "
"data %8.8f\n", int(reg_idx), (float)floatRegFile[reg_idx].d);
@ -67,7 +67,7 @@ class PhysRegFile
// Remove the base Float reg dependency.
reg_idx = reg_idx - numPhysicalIntRegs;
assert(reg_idx < numPhysicalFloatRegs);
assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
DPRINTF(IEW, "RegFile: Access to float register %i as double, has "
" data %8.8f\n", int(reg_idx), floatRegFile[reg_idx].d);
@ -80,7 +80,7 @@ class PhysRegFile
// Remove the base Float reg dependency.
reg_idx = reg_idx - numPhysicalIntRegs;
assert(reg_idx < numPhysicalFloatRegs);
assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
DPRINTF(IEW, "RegFile: Access to float register %i as int, has data "
"%lli\n", int(reg_idx), floatRegFile[reg_idx].q);
@ -103,7 +103,7 @@ class PhysRegFile
// Remove the base Float reg dependency.
reg_idx = reg_idx - numPhysicalIntRegs;
assert(reg_idx < numPhysicalFloatRegs);
assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n",
int(reg_idx), val);
@ -116,7 +116,7 @@ class PhysRegFile
// Remove the base Float reg dependency.
reg_idx = reg_idx - numPhysicalIntRegs;
assert(reg_idx < numPhysicalFloatRegs);
assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n",
int(reg_idx), val);
@ -129,7 +129,7 @@ class PhysRegFile
// Remove the base Float reg dependency.
reg_idx = reg_idx - numPhysicalIntRegs;
assert(reg_idx < numPhysicalFloatRegs);
assert(reg_idx < numPhysicalFloatRegs + numPhysicalIntRegs);
DPRINTF(IEW, "RegFile: Setting float register %i to %lli\n",
int(reg_idx), val);

View file

@ -54,6 +54,8 @@ class SimpleRename
public:
SimpleRename(Params &params);
void regStats();
void setCPU(FullCPU *cpu_ptr);
void setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr);
@ -182,6 +184,22 @@ class SimpleRename
* group of instructions, it can restart at the proper instruction.
*/
unsigned numInst;
Stats::Scalar<> renameSquashCycles;
Stats::Scalar<> renameIdleCycles;
Stats::Scalar<> renameBlockCycles;
Stats::Scalar<> renameUnblockCycles;
Stats::Scalar<> renameRenamedInsts;
Stats::Scalar<> renameSquashedInsts;
Stats::Scalar<> renameROBFullEvents;
Stats::Scalar<> renameIQFullEvents;
Stats::Scalar<> renameFullRegistersEvents;
Stats::Scalar<> renameRenamedOperands;
Stats::Scalar<> renameRenameLookups;
Stats::Scalar<> renameHBPlaceHolders;
Stats::Scalar<> renameCommittedMaps;
Stats::Scalar<> renameUndoneMaps;
Stats::Scalar<> renameValidUndoneMaps;
};
#endif // __SIMPLE_RENAME_HH__

View file

@ -14,6 +14,72 @@ SimpleRename<Impl>::SimpleRename(Params &params)
_status = Idle;
}
template <class Impl>
void
SimpleRename<Impl>::regStats()
{
    // Register all rename-stage statistics.  Each stat uses itself as its
    // prereq, so it is only printed when it has a nonzero value.
    renameSquashCycles
        .name(name() + ".renameSquashCycles")
        .desc("Number of cycles rename is squashing")
        .prereq(renameSquashCycles);
    renameIdleCycles
        .name(name() + ".renameIdleCycles")
        .desc("Number of cycles rename is idle")
        .prereq(renameIdleCycles);
    renameBlockCycles
        .name(name() + ".renameBlockCycles")
        .desc("Number of cycles rename is blocking")
        .prereq(renameBlockCycles);
    renameUnblockCycles
        .name(name() + ".renameUnblockCycles")
        .desc("Number of cycles rename is unblocking")
        .prereq(renameUnblockCycles);
    renameRenamedInsts
        .name(name() + ".renameRenamedInsts")
        .desc("Number of instructions processed by rename")
        .prereq(renameRenamedInsts);
    renameSquashedInsts
        .name(name() + ".renameSquashedInsts")
        .desc("Number of squashed instructions processed by rename")
        .prereq(renameSquashedInsts);
    renameROBFullEvents
        .name(name() + ".renameROBFullEvents")
        .desc("Number of times rename has considered the ROB 'full'")
        .prereq(renameROBFullEvents);
    renameIQFullEvents
        .name(name() + ".renameIQFullEvents")
        .desc("Number of times rename has considered the IQ 'full'")
        .prereq(renameIQFullEvents);
    // NOTE(review): stat name says "renameFullRegisterEvents" (singular)
    // while the variable is renameFullRegistersEvents -- confirm which
    // spelling is intended before renaming either.
    renameFullRegistersEvents
        .name(name() + ".renameFullRegisterEvents")
        .desc("Number of times there has been no free registers")
        .prereq(renameFullRegistersEvents);
    renameRenamedOperands
        .name(name() + ".renameRenamedOperands")
        .desc("Number of destination operands rename has renamed")
        .prereq(renameRenamedOperands);
    renameRenameLookups
        .name(name() + ".renameRenameLookups")
        .desc("Number of register rename lookups that rename has made")
        .prereq(renameRenameLookups);
    renameHBPlaceHolders
        .name(name() + ".renameHBPlaceHolders")
        .desc("Number of place holders added to the history buffer")
        .prereq(renameHBPlaceHolders);
    renameCommittedMaps
        .name(name() + ".renameCommittedMaps")
        .desc("Number of HB maps that are committed")
        .prereq(renameCommittedMaps);
    renameUndoneMaps
        .name(name() + ".renameUndoneMaps")
        .desc("Number of HB maps that are undone due to squashing")
        .prereq(renameUndoneMaps);
    renameValidUndoneMaps
        .name(name() + ".renameValidUndoneMaps")
        .desc("Number of HB maps that are undone, and are not place holders")
        .prereq(renameValidUndoneMaps);
}
template <class Impl>
void
SimpleRename<Impl>::setCPU(FullCPU *cpu_ptr)
@ -59,7 +125,6 @@ SimpleRename<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
// Setup wire to get information from decode.
fromDecode = decodeQueue->getWire(-decodeToRenameDelay);
}
template <class Impl>
@ -124,7 +189,7 @@ SimpleRename<Impl>::unblock()
// continue to tell previous stages to stall. They will be
// able to restart once the skid buffer is empty.
if (!skidBuffer.empty()) {
toDecode->renameInfo.stall = true;
toDecode->renameInfo.stall = true;
} else {
DPRINTF(Rename, "Rename: Done unblocking.\n");
_status = Running;
@ -136,7 +201,6 @@ void
SimpleRename<Impl>::doSquash()
{
typename list<RenameHistory>::iterator hb_it = historyBuffer.begin();
// typename list<RenameHistory>::iterator delete_it;
InstSeqNum squashed_seq_num = fromCommit->commitInfo.doneSeqNum;
@ -154,6 +218,8 @@ SimpleRename<Impl>::doSquash()
// they did and freeing up the registers.
while ((*hb_it).instSeqNum > squashed_seq_num)
{
assert(hb_it != historyBuffer.end());
DPRINTF(Rename, "Rename: Removing history entry with sequence "
"number %i.\n", (*hb_it).instSeqNum);
@ -165,15 +231,13 @@ SimpleRename<Impl>::doSquash()
// Put the renamed physical register back on the free list.
freeList->addReg(hb_it->newPhysReg);
++renameValidUndoneMaps;
}
// delete_it = hb_it;
// hb_it++;
historyBuffer.erase(hb_it++);
assert(hb_it != historyBuffer.end());
++renameUndoneMaps;
}
}
@ -196,9 +260,6 @@ SimpleRename<Impl>::squash()
doSquash();
}
// In the future, when a SmartPtr is used for DynInst, then this function
// itself can handle returning the instruction's physical registers to
// the free list.
template<class Impl>
void
SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num)
@ -233,19 +294,20 @@ SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num)
if (!(*hb_it).placeHolder) {
freeList->addReg((*hb_it).prevPhysReg);
++renameCommittedMaps;
}
historyBuffer.erase(hb_it--);
}
// Finally free up the previous register of the squashed instruction
// Finally free up the previous register of the finished instruction
// itself.
if (!(*hb_it).placeHolder) {
freeList->addReg(hb_it->prevPhysReg);
++renameCommittedMaps;
}
historyBuffer.erase(hb_it);
}
template <class Impl>
@ -263,7 +325,7 @@ SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst)
// Look up the source registers to get the phys. register they've
// been renamed to, and set the sources to those registers.
RegIndex renamed_reg = renameMap->lookup(src_reg);
PhysRegIndex renamed_reg = renameMap->lookup(src_reg);
DPRINTF(Rename, "Rename: Looking up arch reg %i, got "
"physical reg %i.\n", (int)src_reg, (int)renamed_reg);
@ -278,6 +340,8 @@ SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst)
inst->markSrcRegReady(src_idx);
}
++renameRenameLookups;
}
}
@ -289,40 +353,6 @@ SimpleRename<Impl>::renameDestRegs(DynInstPtr &inst)
unsigned num_dest_regs = inst->numDestRegs();
// Rename the destination registers.
for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++)
{
RegIndex dest_reg = inst->destRegIdx(dest_idx);
// Get the physical register that the destination will be
// renamed to.
rename_result = renameMap->rename(dest_reg);
DPRINTF(Rename, "Rename: Renaming arch reg %i to physical "
"reg %i.\n", (int)dest_reg,
(int)rename_result.first);
// Record the rename information so that a history can be kept.
RenameHistory hb_entry(inst->seqNum, dest_reg,
rename_result.first,
rename_result.second);
historyBuffer.push_front(hb_entry);
DPRINTF(Rename, "Rename: Adding instruction to history buffer, "
"sequence number %lli.\n",
(*historyBuffer.begin()).instSeqNum);
// Tell the instruction to rename the appropriate destination
// register (dest_idx) to the new physical register
// (rename_result.first), and record the previous physical
// register that the same logical register was renamed to
// (rename_result.second).
inst->renameDestReg(dest_idx,
rename_result.first,
rename_result.second);
}
// If it's an instruction with no destination registers, then put
// a placeholder within the history buffer. It might be better
// to not put it in the history buffer at all (other than branches,
@ -337,6 +367,45 @@ SimpleRename<Impl>::renameDestRegs(DynInstPtr &inst)
DPRINTF(Rename, "Rename: Adding placeholder instruction to "
"history buffer, sequence number %lli.\n",
inst->seqNum);
++renameHBPlaceHolders;
} else {
// Rename the destination registers.
for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++)
{
RegIndex dest_reg = inst->destRegIdx(dest_idx);
// Get the physical register that the destination will be
// renamed to.
rename_result = renameMap->rename(dest_reg);
DPRINTF(Rename, "Rename: Renaming arch reg %i to physical "
"reg %i.\n", (int)dest_reg,
(int)rename_result.first);
// Record the rename information so that a history can be kept.
RenameHistory hb_entry(inst->seqNum, dest_reg,
rename_result.first,
rename_result.second);
historyBuffer.push_front(hb_entry);
DPRINTF(Rename, "Rename: Adding instruction to history buffer, "
"sequence number %lli.\n",
(*historyBuffer.begin()).instSeqNum);
// Tell the instruction to rename the appropriate destination
// register (dest_idx) to the new physical register
// (rename_result.first), and record the previous physical
// register that the same logical register was renamed to
// (rename_result.second).
inst->renameDestReg(dest_idx,
rename_result.first,
rename_result.second);
++renameRenamedOperands;
}
}
}
@ -379,6 +448,8 @@ SimpleRename<Impl>::tick()
// buffer were used. Remove those instructions and handle
// the rest of unblocking.
if (_status == Unblocking) {
++renameUnblockCycles;
if (fromDecode->size > 0) {
// Add the current inputs onto the skid buffer, so they can be
// reprocessed when this stage unblocks.
@ -388,6 +459,8 @@ SimpleRename<Impl>::tick()
unblock();
}
} else if (_status == Blocked) {
++renameBlockCycles;
// If stage is blocked and still receiving valid instructions,
// make sure to store them in the skid buffer.
if (fromDecode->size > 0) {
@ -425,6 +498,8 @@ SimpleRename<Impl>::tick()
return;
}
} else if (_status == Squashing) {
++renameSquashCycles;
if (fromCommit->commitInfo.squash) {
squash();
} else if (!fromCommit->commitInfo.squash &&
@ -439,7 +514,13 @@ SimpleRename<Impl>::tick()
// Ugly code, revamp all of the tick() functions eventually.
if (fromCommit->commitInfo.doneSeqNum != 0 && _status != Squashing) {
#ifndef FULL_SYSTEM
if (!fromCommit->commitInfo.squash) {
removeFromHistory(fromCommit->commitInfo.doneSeqNum);
}
#else
removeFromHistory(fromCommit->commitInfo.doneSeqNum);
#endif
}
// Perhaps put this outside of this function, since this will
@ -539,6 +620,12 @@ SimpleRename<Impl>::rename()
// Tell previous stage to stall.
toDecode->renameInfo.stall = true;
if (free_rob_entries <= 0) {
++renameROBFullEvents;
} else {
++renameIQFullEvents;
}
return;
} else if (min_iq_rob < insts_available) {
DPRINTF(Rename, "Rename: Will have to block this cycle. Only "
@ -548,6 +635,12 @@ SimpleRename<Impl>::rename()
insts_available = min_iq_rob;
block_this_cycle = true;
if (free_rob_entries < free_iq_entries) {
++renameROBFullEvents;
} else {
++renameIQFullEvents;
}
}
while (insts_available > 0) {
@ -566,6 +659,8 @@ SimpleRename<Impl>::rename()
// Go to the next instruction.
++numInst;
++renameSquashedInsts;
// Decrement how many instructions are available.
--insts_available;
@ -606,6 +701,8 @@ SimpleRename<Impl>::rename()
block_this_cycle = true;
++renameFullRegistersEvents;
break;
}
@ -625,6 +722,8 @@ SimpleRename<Impl>::rename()
++to_iew_index;
++numInst;
++renameRenamedInsts;
// Decrement how many instructions are available.
--insts_available;
}

View file

@ -72,7 +72,7 @@ SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs,
floatRenameMap[index].physical_reg = float_reg_idx++;
}
for (RegIndex index = numPhysicalIntRegs;
for (PhysRegIndex index = numPhysicalIntRegs;
index < numPhysicalIntRegs + numLogicalFloatRegs; ++index)
{
floatScoreboard[index] = 1;
@ -88,7 +88,7 @@ SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs,
}
// Initialize the entries in the misc register scoreboard to be ready.
for (RegIndex index = numPhysicalRegs;
for (PhysRegIndex index = numPhysicalRegs;
index < numPhysicalRegs + numMiscRegs; ++index)
{
miscScoreboard[index] = 1;

View file

@ -139,9 +139,7 @@ bool
ROB<Impl>::isHeadReady()
{
if (numInstsInROB != 0) {
DynInstPtr head_inst = cpu->instList.front();
return head_inst->readyToCommit();
return cpu->instList.front()->readyToCommit();
}
return false;

View file

@ -5,6 +5,8 @@ StoreSet::StoreSet(int _SSIT_size, int _LFST_size)
: SSIT_size(_SSIT_size), LFST_size(_LFST_size)
{
DPRINTF(StoreSet, "StoreSet: Creating store set object.\n");
DPRINTF(StoreSet, "StoreSet: SSIT size: %i, LFST size: %i.\n",
SSIT_size, LFST_size);
SSIT = new SSID[SSIT_size];
@ -31,11 +33,13 @@ StoreSet::StoreSet(int _SSIT_size, int _LFST_size)
}
void
StoreSet::violation(Addr load_PC, Addr store_PC)
StoreSet::violation(Addr store_PC, Addr load_PC)
{
int load_index = calcIndex(load_PC);
int store_index = calcIndex(store_PC);
assert(load_index < SSIT_size && store_index < SSIT_size);
bool valid_load_SSID = validSSIT[load_index];
bool valid_store_SSID = validSSIT[store_index];
@ -51,7 +55,14 @@ StoreSet::violation(Addr load_PC, Addr store_PC)
SSIT[store_index] = new_set;
assert(new_set < LFST_size);
SSCounters[new_set]++;
DPRINTF(StoreSet, "StoreSet: Neither load nor store had a valid "
"storeset, creating a new one: %i for load %#x, store %#x\n",
new_set, load_PC, store_PC);
} else if (valid_load_SSID && !valid_store_SSID) {
SSID load_SSID = SSIT[load_index];
@ -59,7 +70,13 @@ StoreSet::violation(Addr load_PC, Addr store_PC)
SSIT[store_index] = load_SSID;
assert(load_SSID < LFST_size);
SSCounters[load_SSID]++;
DPRINTF(StoreSet, "StoreSet: Load had a valid store set. Adding "
"store to that set: %i for load %#x, store %#x\n",
load_SSID, load_PC, store_PC);
} else if (!valid_load_SSID && valid_store_SSID) {
SSID store_SSID = SSIT[store_index];
@ -69,10 +86,16 @@ StoreSet::violation(Addr load_PC, Addr store_PC)
// Because we are having a load point to an already existing set,
// the size of the store set is not incremented.
DPRINTF(StoreSet, "StoreSet: Store had a valid store set: %i for "
"load %#x, store %#x\n",
store_SSID, load_PC, store_PC);
} else {
SSID load_SSID = SSIT[load_index];
SSID store_SSID = SSIT[store_index];
assert(load_SSID < LFST_size && store_SSID < LFST_size);
int load_SS_size = SSCounters[load_SSID];
int store_SS_size = SSCounters[store_SSID];
@ -83,11 +106,19 @@ StoreSet::violation(Addr load_PC, Addr store_PC)
SSCounters[load_SSID]++;
SSCounters[store_SSID]--;
DPRINTF(StoreSet, "StoreSet: Load had bigger store set: %i; "
"for load %#x, store %#x\n",
load_SSID, load_PC, store_PC);
} else {
SSIT[load_index] = store_SSID;
SSCounters[store_SSID]++;
SSCounters[load_SSID]--;
DPRINTF(StoreSet, "StoreSet: Store had bigger store set: %i; "
"for load %#x, store %#x\n",
store_SSID, load_PC, store_PC);
}
}
}
@ -106,6 +137,8 @@ StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num)
int store_SSID;
assert(index < SSIT_size);
if (!validSSIT[index]) {
// Do nothing if there's no valid entry.
return;
@ -116,6 +149,11 @@ StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num)
// Update the last store that was fetched with the current one.
LFST[store_SSID] = store_seq_num;
validLFST[store_SSID] = 1;
DPRINTF(StoreSet, "Store %#x updated the LFST, SSID: %i\n",
store_PC, store_SSID);
}
}
@ -126,7 +164,12 @@ StoreSet::checkInst(Addr PC)
int inst_SSID;
assert(index < SSIT_size);
if (!validSSIT[index]) {
DPRINTF(StoreSet, "Inst %#x with index %i had no SSID\n",
PC, index);
// Return 0 if there's no valid entry.
return 0;
} else {
@ -135,8 +178,15 @@ StoreSet::checkInst(Addr PC)
assert(inst_SSID < LFST_size);
if (!validLFST[inst_SSID]) {
DPRINTF(StoreSet, "Inst %#x with index %i and SSID %i had no "
"dependency\n", PC, index, inst_SSID);
return 0;
} else {
DPRINTF(StoreSet, "Inst %#x with index %i and SSID %i had LFST "
"inum of %i\n", PC, index, inst_SSID, LFST[inst_SSID]);
return LFST[inst_SSID];
}
}
@ -154,14 +204,21 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store)
int store_SSID;
assert(index < SSIT_size);
// Make sure the SSIT still has a valid entry for the issued store.
assert(validSSIT[index]);
if (!validSSIT[index]) {
return;
}
store_SSID = SSIT[index];
assert(store_SSID < LFST_size);
// If the last fetched store in the store set refers to the store that
// was just issued, then invalidate the entry.
if (validLFST[store_SSID] && LFST[store_SSID] == issued_seq_num) {
DPRINTF(StoreSet, "StoreSet: store invalidated itself in LFST.\n");
validLFST[store_SSID] = false;
}
}
@ -170,9 +227,14 @@ void
StoreSet::squash(InstSeqNum squashed_num)
{
// Not really sure how to do this well.
// Generally this is small enough that it should be okay; short circuit
// evaluation should take care of invalid entries.
DPRINTF(StoreSet, "StoreSet: Squashing until inum %i\n",
squashed_num);
for (int i = 0; i < LFST_size; ++i) {
if (LFST[i] < squashed_num) {
if (validLFST[i] && LFST[i] < squashed_num) {
validLFST[i] = false;
}
}

View file

@ -14,7 +14,7 @@ class StoreSet
public:
StoreSet(int SSIT_size, int LFST_size);
void violation(Addr load_PC, Addr store_PC);
void violation(Addr store_PC, Addr load_PC);
void insertLoad(Addr load_PC, InstSeqNum load_seq_num);

View file

@ -0,0 +1,243 @@
#include "cpu/beta_cpu/tournament_pred.hh"
// Build a saturating counter that can hold values in [0, 2^bits - 1],
// starting from zero.
TournamentBP::SatCounter::SatCounter(unsigned bits)
{
    maxVal = (1 << bits) - 1;
    counter = 0;
}
// Build a saturating counter that can hold values in [0, 2^bits - 1],
// preloaded with initial_val.
TournamentBP::SatCounter::SatCounter(unsigned bits, unsigned initial_val)
    : maxVal((1 << bits) - 1),
      counter(initial_val)
{
    // Sanity check: the starting value has to fit within a bits-wide
    // saturating counter.
    if (!(initial_val <= maxVal)) {
        panic("BP: Initial counter value exceeds max size.");
    }
}
// Bump the counter up by one, saturating at the top instead of
// wrapping around.
void
TournamentBP::SatCounter::increment()
{
    if (counter >= maxVal)
        return;

    ++counter;
}
// Drop the counter by one, saturating at zero instead of wrapping
// around.
void
TournamentBP::SatCounter::decrement()
{
    if (counter == 0)
        return;

    --counter;
}
// Tournament predictor constructor.  Allocates the local, global, and
// choice predictor tables and sets up the history masks and the taken
// threshold.
//
// The global-predictor parameter order (history bits before counter
// bits) follows the declaration in tournament_pred.hh; the old
// definition listed them in the opposite order, and since parameter
// names are not part of a function's signature, callers' values for the
// two were silently swapped.
TournamentBP::TournamentBP(unsigned _local_predictor_size,
                           unsigned _local_ctr_bits,
                           unsigned _local_history_table_size,
                           unsigned _local_history_bits,
                           unsigned _global_predictor_size,
                           unsigned _global_history_bits,
                           unsigned _global_ctr_bits,
                           unsigned _choice_predictor_size,
                           unsigned _choice_ctr_bits,
                           unsigned _instShiftAmt)
    : local_predictor_size(_local_predictor_size),
      local_ctr_bits(_local_ctr_bits),
      local_history_table_size(_local_history_table_size),
      local_history_bits(_local_history_bits),
      global_predictor_size(_global_predictor_size),
      global_ctr_bits(_global_ctr_bits),
      global_history_bits(_global_history_bits),
      // Fixed: this was previously initialized from
      // _global_predictor_size, so the choice predictor's own size
      // parameter was ignored.
      choice_predictor_size(_choice_predictor_size),
      choice_ctr_bits(_choice_ctr_bits),
      instShiftAmt(_instShiftAmt)
{
    //Should do checks here to make sure sizes are correct (powers of 2)

    // Setup the array of counters for the local predictor.
    // NOTE(review): "new T[n](args)" is a GCC extension, not standard
    // C++ -- consider constructing each counter individually.
    local_ctrs = new SatCounter[local_predictor_size](local_ctr_bits);

    // Setup the history table for the local predictor, cleared to 0.
    local_history_table = new unsigned[local_history_table_size](0);

    // Setup the local history mask.
    localHistoryMask = (1 << local_history_bits) - 1;

    // Setup the array of counters for the global predictor.
    global_ctrs = new SatCounter[global_predictor_size](global_ctr_bits);

    // Clear the global history register.
    global_history = 0;

    // Setup the global history mask.
    globalHistoryMask = (1 << global_history_bits) - 1;

    // Setup the array of counters for the choice predictor.
    choice_ctrs = new SatCounter[choice_predictor_size](choice_ctr_bits);

    // Taken threshold for the saturating counters: a counter value above
    // this predicts taken.  NOTE(review): for 2-bit counters this works
    // out to ((1 << 1) - 1) / 2 == 0 -- confirm that is the intended
    // midpoint.
    threshold = (1 << (local_ctr_bits - 1)) - 1;
    threshold = threshold / 2;
}
inline
unsigned
TournamentBP::calcLocHistIdx(Addr &branch_addr)
{
    // Drop the word offset from the branch PC, then wrap the result into
    // the local history table.  The mask is only valid if
    // local_history_table_size is a power of two.
    Addr shifted_PC = branch_addr >> instShiftAmt;
    return shifted_PC & (local_history_table_size - 1);
}
inline
void
TournamentBP::updateHistoriesTaken(unsigned local_history_idx)
{
    // Shift a taken (1) bit into the global history register, keeping
    // only the configured number of history bits.
    global_history = ((global_history << 1) | 1) & globalHistoryMask;

    // Shift a taken bit into this branch's local history as well.  The
    // local history is not masked here; it is masked when it is used to
    // index the local counters.
    unsigned &local_hist = local_history_table[local_history_idx];
    local_hist = (local_hist << 1) | 1;
}
inline
void
TournamentBP::updateHistoriesNotTaken(unsigned local_history_idx)
{
    // Shift a not-taken (0) bit into the global history register,
    // keeping only the configured number of history bits.
    global_history = (global_history << 1) & globalHistoryMask;

    // Shift a not-taken bit into this branch's local history as well.
    // The local history is masked when used, not here.
    unsigned &local_hist = local_history_table[local_history_idx];
    local_hist = local_hist << 1;
}
// Looks up a prediction for the given branch PC.  Both component
// predictors' counters and the histories are updated speculatively,
// assuming the prediction is correct; update() repairs the state on a
// misprediction.
bool
TournamentBP::lookup(Addr &branch_addr)
{
    uint8_t local_prediction;
    unsigned local_history_idx;
    unsigned local_predictor_idx;

    uint8_t global_prediction;
    uint8_t choice_prediction;

    //Lookup in the local predictor to get its branch prediction
    local_history_idx = calcLocHistIdx(branch_addr);
    local_predictor_idx = local_history_table[local_history_idx]
        & localHistoryMask;
    local_prediction = local_ctrs[local_predictor_idx].read();

    //Lookup in the global predictor to get its branch prediction
    global_prediction = global_ctrs[global_history].read();

    //Lookup in the choice predictor to see which one to use
    choice_prediction = choice_ctrs[global_history].read();

    //@todo Put a threshold value in for the three predictors that can
    //be set through the constructor (so this isn't hard coded).

    // The choice predictor selects which component's prediction is
    // returned.
    bool taken;
    if (choice_prediction > threshold) {
        taken = global_prediction > threshold;
    } else {
        taken = local_prediction > threshold;
    }

    assert(global_history < global_predictor_size &&
           local_predictor_idx < local_predictor_size);

    // Speculatively train both component counters using the same indices
    // the predictions were read from, *before* shifting the histories.
    // Fixed two bugs here: the old code indexed the local counters with
    // local_history_idx (the PC-based history-table index) instead of
    // local_predictor_idx (the history pattern actually read), and it
    // bumped the global counter only after updateHistories*() had
    // already shifted global_history, training a different entry than
    // the one that produced the prediction (update() trains at the
    // pre-shift history).
    if (taken) {
        global_ctrs[global_history].increment();
        local_ctrs[local_predictor_idx].increment();
        updateHistoriesTaken(local_history_idx);
    } else {
        global_ctrs[global_history].decrement();
        local_ctrs[local_predictor_idx].decrement();
        updateHistoriesNotTaken(local_history_idx);
    }

    return taken;
}
// Update the branch predictor if it predicted a branch wrong.
// branch_addr is the PC of the branch, correct_gh is the global history
// register as it was when the branch was predicted, and taken is the
// branch's actual outcome.  This repairs the state that lookup()
// modified speculatively and trains the counters with the true outcome.
void
TournamentBP::update(Addr &branch_addr, unsigned correct_gh, bool taken)
{
    uint8_t local_prediction;
    unsigned local_history_idx;
    unsigned local_predictor_idx;
    bool local_pred_taken;

    uint8_t global_prediction;
    bool global_pred_taken;

    // Load the correct global history into the register.
    global_history = correct_gh;

    // Get the local predictor's current prediction, remove the incorrect
    // update, and update the local predictor
    local_history_idx = calcLocHistIdx(branch_addr);
    local_predictor_idx = local_history_table[local_history_idx];
    // Shift right once to undo the speculative shift lookup() applied to
    // this entry, recovering the pattern that indexed the counters then.
    local_predictor_idx = (local_predictor_idx >> 1) & localHistoryMask;

    local_prediction = local_ctrs[local_predictor_idx].read();
    local_pred_taken = local_prediction > threshold;

    //Get the global predictor's current prediction, and update the
    //global predictor
    global_prediction = global_ctrs[global_history].read();
    global_pred_taken = global_prediction > threshold;

    //Update the choice predictor to tell it which one was correct.
    //Only trained when the two components disagree, since otherwise the
    //choice does not matter.
    if (local_pred_taken != global_pred_taken) {
        //If the local prediction matches the actual outcome, decrement
        //the counter.  Otherwise increment the counter.
        if (local_pred_taken == taken) {
            choice_ctrs[global_history].decrement();
        } else {
            choice_ctrs[global_history].increment();
        }
    }

    if (taken) {
        assert(global_history < global_predictor_size &&
               local_predictor_idx < local_predictor_size);

        // Train both component counters toward taken, indexed by the
        // pre-branch global history and the recovered local pattern.
        local_ctrs[local_predictor_idx].increment();
        global_ctrs[global_history].increment();

        // Shift the actual outcome into the global history register.
        global_history = (global_history << 1) | 1;
        global_history = global_history & globalHistoryMask;

        // lookup() already shifted the local history entry; just fix up
        // the speculatively inserted low bit to match the real outcome.
        local_history_table[local_history_idx] |= 1;
    }
    else {
        assert(global_history < global_predictor_size &&
               local_predictor_idx < local_predictor_size);

        // Train both component counters toward not taken.
        local_ctrs[local_predictor_idx].decrement();
        global_ctrs[global_history].decrement();

        // Shift the actual (not-taken) outcome into the global history.
        global_history = (global_history << 1);
        global_history = global_history & globalHistoryMask;

        // Clear the speculatively inserted low bit of the local history.
        local_history_table[local_history_idx] &= ~1;
    }
}

View file

@ -0,0 +1,160 @@
#ifndef __TOURNAMENT_PRED_HH__
#define __TOURNAMENT_PRED_HH__
// For Addr type.
#include "arch/alpha/isa_traits.hh"
/**
 * Tournament branch predictor that combines a local (per-branch
 * history) predictor and a global (path history) predictor, with a
 * choice predictor that selects between the two for each branch.
 */
class TournamentBP
{
  public:
    /**
     * Default branch predictor constructor.
     */
    // NOTE(review): this declaration lists global_history_bits before
    // global_ctr_bits, while the definition in tournament_pred.cc lists
    // them in the opposite order.  Parameter names are not part of the
    // signature, so the two values travel by position -- confirm which
    // order callers use and make the two files agree.
    TournamentBP(unsigned local_predictor_size,
                 unsigned local_ctr_bits,
                 unsigned local_history_table_size,
                 unsigned local_history_bits,
                 unsigned global_predictor_size,
                 unsigned global_history_bits,
                 unsigned global_ctr_bits,
                 unsigned choice_predictor_size,
                 unsigned choice_ctr_bits,
                 unsigned instShiftAmt);

    /**
     * Looks up the given address in the branch predictor and returns
     * a true/false value as to whether it is taken.  Also speculatively
     * updates the counters and histories.
     * @param branch_addr The address of the branch to look up.
     * @return Whether or not the branch is taken.
     */
    bool lookup(Addr &branch_addr);

    /**
     * Updates the branch predictor with the actual result of a branch.
     * @param branch_addr The address of the branch to update.
     * @param global_history The global history register as it was when
     * the branch was predicted, used to restore predictor state.
     * @param taken Whether or not the branch was taken.
     */
    void update(Addr &branch_addr, unsigned global_history, bool taken);

    /** Returns the current value of the global history register. */
    inline unsigned readGlobalHist() { return global_history; }

  private:
    // NOTE(review): declared but no definition appears in
    // tournament_pred.cc -- confirm whether this is still needed.
    inline bool getPrediction(uint8_t &count);

    /** Computes a branch's index into the local history table. */
    inline unsigned calcLocHistIdx(Addr &branch_addr);

    /** Shifts a taken bit into the global and local histories. */
    inline void updateHistoriesTaken(unsigned local_history_idx);

    /** Shifts a not-taken bit into the global and local histories. */
    inline void updateHistoriesNotTaken(unsigned local_history_idx);

    /**
     * Private counter class for the internal saturating counters.
     * Implements an n bit saturating counter and provides methods to
     * increment, decrement, and read it.
     * @todo Consider making this something that more closely mimics a
     * built in class so you can use ++ or --.
     */
    class SatCounter
    {
      public:
        /**
         * Constructor for the counter.
         * @param bits How many bits the counter will have.
         */
        SatCounter(unsigned bits);

        /**
         * Constructor for the counter.
         * @param bits How many bits the counter will have.
         * @param initial_val Starting value for each counter.
         */
        SatCounter(unsigned bits, unsigned initial_val);

        /**
         * Increments the counter's current value.
         */
        void increment();

        /**
         * Decrements the counter's current value.
         */
        void decrement();

        /**
         * Read the counter's value.
         */
        uint8_t read()
        {
            return counter;
        }

      private:
        // Maximum (saturation) value, 2^bits - 1.  NOTE(review): stored
        // as uint8_t, so counters wider than 8 bits will silently
        // truncate -- confirm bits <= 8 in all configurations.
        uint8_t maxVal;
        // Current counter value, kept within [0, maxVal].
        uint8_t counter;
    };

    /** Local counters. */
    SatCounter *local_ctrs;

    /** Size of the local predictor. */
    unsigned local_predictor_size;

    /** Number of bits of the local predictor's counters. */
    unsigned local_ctr_bits;

    /** Array of local history table entries. */
    unsigned *local_history_table;

    /** Size of the local history table. */
    unsigned local_history_table_size;

    /** Number of bits for each entry of the local history table.
     * @todo Doesn't this come from the size of the local predictor?
     */
    unsigned local_history_bits;

    /** Mask to get the proper local history. */
    unsigned localHistoryMask;

    /** Array of counters that make up the global predictor. */
    SatCounter *global_ctrs;

    /** Size of the global predictor. */
    unsigned global_predictor_size;

    /** Number of bits of the global predictor's counters. */
    unsigned global_ctr_bits;

    /** Global history register. */
    unsigned global_history;

    /** Number of bits for the global history. */
    unsigned global_history_bits;

    /** Mask to get the proper global history. */
    unsigned globalHistoryMask;

    /** Array of counters that make up the choice predictor. */
    SatCounter *choice_ctrs;

    /** Size of the choice predictor. */
    unsigned choice_predictor_size;

    /** Number of bits of the choice predictor's counters. */
    unsigned choice_ctr_bits;

    /** Number of bits to shift the instruction over to get rid of the word
     * offset.
     */
    unsigned instShiftAmt;

    /** Threshold for the counter value; above the threshold is taken,
     * equal to or below the threshold is not taken.
     */
    unsigned threshold;
};
#endif // __TOURNAMENT_PRED_HH__